Total coverage: 111139 of 1824683 (7%)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2011 Instituto Nokia de Tecnologia
 *
 * Authors:
 *    Lauro Ramos Venancio <lauro.venancio@openbossa.org>
 *    Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/rfkill.h>
#include <linux/nfc.h>

#include <net/genetlink.h>

#include "nfc.h"

#define VERSION "0.1"

#define NFC_CHECK_PRES_FREQ_MS	2000

int nfc_devlist_generation;
DEFINE_MUTEX(nfc_devlist_mutex);

/* NFC device ID bitmap */
static DEFINE_IDA(nfc_index_ida);

int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name)
{
	int rc = 0;

	pr_debug("%s do firmware %s\n", dev_name(&dev->dev), firmware_name);

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (dev->dev_up) {
		rc = -EBUSY;
		goto error;
	}

	if (!dev->ops->fw_download) {
		rc = -EOPNOTSUPP;
		goto error;
	}

	dev->fw_download_in_progress = true;
	rc = dev->ops->fw_download(dev, firmware_name);
	if (rc)
		dev->fw_download_in_progress = false;

error:
	device_unlock(&dev->dev);
	return rc;
}

/**
 * nfc_fw_download_done - inform that a firmware download was completed
 *
 * @dev: The nfc device to which firmware was downloaded
 * @firmware_name: The firmware filename
 * @result: The positive value of a standard errno value
 */
int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
			 u32 result)
{
	dev->fw_download_in_progress = false;

	return nfc_genl_fw_download_done(dev, firmware_name, result);
}
EXPORT_SYMBOL(nfc_fw_download_done);
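/*
 * Illustrative sketch (not part of this file): the fw_download op is
 * expected to start the transfer and report completion asynchronously
 * through nfc_fw_download_done(). The driver names below are hypothetical:
 *
 *	static int my_fw_download(struct nfc_dev *dev, const char *name)
 *	{
 *		return my_start_fw_transfer(dev, name);	// 0 = started OK
 *	}
 *
 *	// later, from the driver's completion handler:
 *	nfc_fw_download_done(dev, name, 0);
 */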
/**
 * nfc_dev_up - turn on the NFC device
 *
 * @dev: The nfc device to be turned on
 *
 * The device remains up until the nfc_dev_down function is called.
 */
int nfc_dev_up(struct nfc_dev *dev)
{
	int rc = 0;

	pr_debug("dev_name=%s\n", dev_name(&dev->dev));

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (dev->rfkill && rfkill_blocked(dev->rfkill)) {
		rc = -ERFKILL;
		goto error;
	}

	if (dev->fw_download_in_progress) {
		rc = -EBUSY;
		goto error;
	}

	if (dev->dev_up) {
		rc = -EALREADY;
		goto error;
	}

	if (dev->ops->dev_up)
		rc = dev->ops->dev_up(dev);

	if (!rc)
		dev->dev_up = true;

	/* We have to enable the device before discovering SEs */
	if (dev->ops->discover_se && dev->ops->discover_se(dev))
		pr_err("SE discovery failed\n");

error:
	device_unlock(&dev->dev);
	return rc;
}

/**
 * nfc_dev_down - turn off the NFC device
 *
 * @dev: The nfc device to be turned off
 */
int nfc_dev_down(struct nfc_dev *dev)
{
	int rc = 0;

	pr_debug("dev_name=%s\n", dev_name(&dev->dev));

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (!dev->dev_up) {
		rc = -EALREADY;
		goto error;
	}

	if (dev->polling || dev->active_target) {
		rc = -EBUSY;
		goto error;
	}

	if (dev->ops->dev_down)
		dev->ops->dev_down(dev);

	dev->dev_up = false;

error:
	device_unlock(&dev->dev);
	return rc;
}

static int nfc_rfkill_set_block(void *data, bool blocked)
{
	struct nfc_dev *dev = data;

	pr_debug("%s blocked %d", dev_name(&dev->dev), blocked);

	if (!blocked)
		return 0;

	nfc_dev_down(dev);

	return 0;
}

static const struct rfkill_ops nfc_rfkill_ops = {
	.set_block = nfc_rfkill_set_block,
};
/**
 * nfc_start_poll - start polling for nfc targets
 *
 * @dev: The nfc device that must start polling
 * @im_protocols: bitset of nfc initiator protocols to be used for polling
 * @tm_protocols: bitset of nfc transport protocols to be used for polling
 *
 * The device remains polling for targets until a target is found or
 * the nfc_stop_poll function is called.
 */
int nfc_start_poll(struct nfc_dev *dev, u32 im_protocols, u32 tm_protocols)
{
	int rc;

	pr_debug("dev_name %s initiator protocols 0x%x target protocols 0x%x\n",
		 dev_name(&dev->dev), im_protocols, tm_protocols);

	if (!im_protocols && !tm_protocols)
		return -EINVAL;

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (!dev->dev_up) {
		rc = -ENODEV;
		goto error;
	}

	if (dev->polling) {
		rc = -EBUSY;
		goto error;
	}

	rc = dev->ops->start_poll(dev, im_protocols, tm_protocols);
	if (!rc) {
		dev->polling = true;
		dev->rf_mode = NFC_RF_NONE;
	}

error:
	device_unlock(&dev->dev);
	return rc;
}

/**
 * nfc_stop_poll - stop polling for nfc targets
 *
 * @dev: The nfc device that must stop polling
 */
int nfc_stop_poll(struct nfc_dev *dev)
{
	int rc = 0;

	pr_debug("dev_name=%s\n", dev_name(&dev->dev));

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (!dev->polling) {
		rc = -EINVAL;
		goto error;
	}

	dev->ops->stop_poll(dev);
	dev->polling = false;
	dev->rf_mode = NFC_RF_NONE;

error:
	device_unlock(&dev->dev);
	return rc;
}

static struct nfc_target *nfc_find_target(struct nfc_dev *dev, u32 target_idx)
{
	int i;

	for (i = 0; i < dev->n_targets; i++) {
		if (dev->targets[i].idx == target_idx)
			return &dev->targets[i];
	}

	return NULL;
}

int nfc_dep_link_up(struct nfc_dev *dev, int target_index, u8 comm_mode)
{
	int rc = 0;
	u8 *gb;
	size_t gb_len;
	struct nfc_target *target;

	pr_debug("dev_name=%s comm %d\n", dev_name(&dev->dev), comm_mode);

	if (!dev->ops->dep_link_up)
		return -EOPNOTSUPP;

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (dev->dep_link_up == true) {
		rc = -EALREADY;
		goto error;
	}

	gb = nfc_llcp_general_bytes(dev, &gb_len);
	if (gb_len > NFC_MAX_GT_LEN) {
		rc = -EINVAL;
		goto error;
	}

	target = nfc_find_target(dev, target_index);
	if (target == NULL) {
		rc = -ENOTCONN;
		goto error;
	}

	rc = dev->ops->dep_link_up(dev, target, comm_mode, gb, gb_len);
	if (!rc) {
		dev->active_target = target;
		dev->rf_mode = NFC_RF_INITIATOR;
	}

error:
	device_unlock(&dev->dev);
	return rc;
}

int nfc_dep_link_down(struct nfc_dev *dev)
{
	int rc = 0;

	pr_debug("dev_name=%s\n", dev_name(&dev->dev));

	if (!dev->ops->dep_link_down)
		return -EOPNOTSUPP;

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (dev->dep_link_up == false) {
		rc = -EALREADY;
		goto error;
	}

	rc = dev->ops->dep_link_down(dev);
	if (!rc) {
		dev->dep_link_up = false;
		dev->active_target = NULL;
		dev->rf_mode = NFC_RF_NONE;
		nfc_llcp_mac_is_down(dev);
		nfc_genl_dep_link_down_event(dev);
	}

error:
	device_unlock(&dev->dev);
	return rc;
}

int nfc_dep_link_is_up(struct nfc_dev *dev, u32 target_idx,
		       u8 comm_mode, u8 rf_mode)
{
	dev->dep_link_up = true;

	if (!dev->active_target && rf_mode == NFC_RF_INITIATOR) {
		struct nfc_target *target;

		target = nfc_find_target(dev, target_idx);
		if (target == NULL)
			return -ENOTCONN;

		dev->active_target = target;
	}

	dev->polling = false;
	dev->rf_mode = rf_mode;

	nfc_llcp_mac_is_up(dev, target_idx, comm_mode, rf_mode);

	return nfc_genl_dep_link_up_event(dev, target_idx, comm_mode, rf_mode);
}
EXPORT_SYMBOL(nfc_dep_link_is_up);
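/*
 * Illustrative sketch (not part of this file): the initiator-side flow as
 * seen by the core, with hypothetical callback names. Each step below is
 * normally driven by a netlink request from userspace:
 *
 *	nfc_start_poll(dev, NFC_PROTO_MIFARE_MASK, 0);
 *	// the driver reports a discovered tag via nfc_targets_found()
 *	nfc_activate_target(dev, target_idx, NFC_PROTO_MIFARE);
 *	nfc_data_exchange(dev, target_idx, skb, my_recv_cb, my_ctx);
 *	// my_recv_cb() fires when the response arrives
 *	nfc_deactivate_target(dev, target_idx, mode); // mode: idle or sleep
 */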
/**
 * nfc_activate_target - prepare the target for data exchange
 *
 * @dev: The nfc device that found the target
 * @target_idx: index of the target that must be activated
 * @protocol: nfc protocol that will be used for data exchange
 */
int nfc_activate_target(struct nfc_dev *dev, u32 target_idx, u32 protocol)
{
	int rc;
	struct nfc_target *target;

	pr_debug("dev_name=%s target_idx=%u protocol=%u\n",
		 dev_name(&dev->dev), target_idx, protocol);

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (dev->active_target) {
		rc = -EBUSY;
		goto error;
	}

	target = nfc_find_target(dev, target_idx);
	if (target == NULL) {
		rc = -ENOTCONN;
		goto error;
	}

	rc = dev->ops->activate_target(dev, target, protocol);
	if (!rc) {
		dev->active_target = target;
		dev->rf_mode = NFC_RF_INITIATOR;

		if (dev->ops->check_presence && !dev->shutting_down)
			mod_timer(&dev->check_pres_timer, jiffies +
				  msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS));
	}

error:
	device_unlock(&dev->dev);
	return rc;
}

/**
 * nfc_deactivate_target - deactivate a nfc target
 *
 * @dev: The nfc device that found the target
 * @target_idx: index of the target that must be deactivated
 * @mode: idle or sleep?
 */
int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode)
{
	int rc = 0;

	pr_debug("dev_name=%s target_idx=%u\n",
		 dev_name(&dev->dev), target_idx);

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (dev->active_target == NULL) {
		rc = -ENOTCONN;
		goto error;
	}

	if (dev->active_target->idx != target_idx) {
		rc = -ENOTCONN;
		goto error;
	}

	if (dev->ops->check_presence)
		del_timer_sync(&dev->check_pres_timer);

	dev->ops->deactivate_target(dev, dev->active_target, mode);
	dev->active_target = NULL;

error:
	device_unlock(&dev->dev);
	return rc;
}

/**
 * nfc_data_exchange - transceive data
 *
 * @dev: The nfc device that found the target
 * @target_idx: index of the target
 * @skb: data to be sent
 * @cb: callback called when the response is received
 * @cb_context: parameter for the callback function
 *
 * The user must wait for the callback before calling this function again.
 */
int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb,
		      data_exchange_cb_t cb, void *cb_context)
{
	int rc;

	pr_debug("dev_name=%s target_idx=%u skb->len=%u\n",
		 dev_name(&dev->dev), target_idx, skb->len);

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		kfree_skb(skb);
		goto error;
	}

	if (dev->rf_mode == NFC_RF_INITIATOR && dev->active_target != NULL) {
		if (dev->active_target->idx != target_idx) {
			rc = -EADDRNOTAVAIL;
			kfree_skb(skb);
			goto error;
		}

		if (dev->ops->check_presence)
			del_timer_sync(&dev->check_pres_timer);

		rc = dev->ops->im_transceive(dev, dev->active_target, skb, cb,
					     cb_context);

		if (!rc && dev->ops->check_presence && !dev->shutting_down)
			mod_timer(&dev->check_pres_timer, jiffies +
				  msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS));
	} else if (dev->rf_mode == NFC_RF_TARGET && dev->ops->tm_send != NULL) {
		rc = dev->ops->tm_send(dev, skb);
	} else {
		rc = -ENOTCONN;
		kfree_skb(skb);
		goto error;
	}

error:
	device_unlock(&dev->dev);
	return rc;
}

struct nfc_se *nfc_find_se(struct nfc_dev *dev, u32 se_idx)
{
	struct nfc_se *se;

	list_for_each_entry(se, &dev->secure_elements, list)
		if (se->idx == se_idx)
			return se;

	return NULL;
}
EXPORT_SYMBOL(nfc_find_se);

int nfc_enable_se(struct nfc_dev *dev, u32 se_idx)
{
	struct nfc_se *se;
	int rc;

	pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (!dev->dev_up) {
		rc = -ENODEV;
		goto error;
	}

	if (dev->polling) {
		rc = -EBUSY;
		goto error;
	}

	if (!dev->ops->enable_se || !dev->ops->disable_se) {
		rc = -EOPNOTSUPP;
		goto error;
	}

	se = nfc_find_se(dev, se_idx);
	if (!se) {
		rc = -EINVAL;
		goto error;
	}

	if (se->state == NFC_SE_ENABLED) {
		rc = -EALREADY;
		goto error;
	}

	rc = dev->ops->enable_se(dev, se_idx);
	if (rc >= 0)
		se->state = NFC_SE_ENABLED;

error:
	device_unlock(&dev->dev);
	return rc;
}

int nfc_disable_se(struct nfc_dev *dev, u32 se_idx)
{
	struct nfc_se *se;
	int rc;

	pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);

	device_lock(&dev->dev);

	if (dev->shutting_down) {
		rc = -ENODEV;
		goto error;
	}

	if (!dev->dev_up) {
		rc = -ENODEV;
		goto error;
	}

	if (!dev->ops->enable_se || !dev->ops->disable_se) {
		rc = -EOPNOTSUPP;
		goto error;
	}

	se = nfc_find_se(dev, se_idx);
	if (!se) {
		rc = -EINVAL;
		goto error;
	}

	if (se->state == NFC_SE_DISABLED) {
		rc = -EALREADY;
		goto error;
	}

	rc = dev->ops->disable_se(dev, se_idx);
	if (rc >= 0)
		se->state = NFC_SE_DISABLED;

error:
	device_unlock(&dev->dev);
	return rc;
}

int nfc_set_remote_general_bytes(struct nfc_dev *dev, const u8 *gb, u8 gb_len)
{
	pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len);

	return nfc_llcp_set_remote_gb(dev, gb, gb_len);
}
EXPORT_SYMBOL(nfc_set_remote_general_bytes);

u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len)
{
	pr_debug("dev_name=%s\n", dev_name(&dev->dev));

	return nfc_llcp_general_bytes(dev, gb_len);
}
EXPORT_SYMBOL(nfc_get_local_general_bytes);

int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb)
{
	/* Only LLCP target mode for now */
	if (dev->dep_link_up == false) {
		kfree_skb(skb);
		return -ENOLINK;
	}

	return nfc_llcp_data_received(dev, skb);
}
EXPORT_SYMBOL(nfc_tm_data_received);

int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode,
		     const u8 *gb, size_t gb_len)
{
	int rc;

	device_lock(&dev->dev);

	dev->polling = false;

	if (gb != NULL) {
		rc = nfc_set_remote_general_bytes(dev, gb, gb_len);
		if (rc < 0)
			goto out;
	}

	dev->rf_mode = NFC_RF_TARGET;

	if (protocol == NFC_PROTO_NFC_DEP_MASK)
		nfc_dep_link_is_up(dev, 0, comm_mode, NFC_RF_TARGET);

	rc = nfc_genl_tm_activated(dev, protocol);

out:
	device_unlock(&dev->dev);

	return rc;
}
EXPORT_SYMBOL(nfc_tm_activated);

int nfc_tm_deactivated(struct nfc_dev *dev)
{
	dev->dep_link_up = false;
	dev->rf_mode = NFC_RF_NONE;

	return nfc_genl_tm_deactivated(dev);
}
EXPORT_SYMBOL(nfc_tm_deactivated);

/**
 * nfc_alloc_send_skb - allocate a skb for data exchange responses
 *
 * @dev: device sending the response
 * @sk: socket sending the response
 * @flags: MSG_DONTWAIT flag
 * @size: size to allocate
 * @err: pointer to memory to store the error code
 */
struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk,
				   unsigned int flags, unsigned int size,
				   unsigned int *err)
{
	struct sk_buff *skb;
	unsigned int total_size;

	total_size = size +
		dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE;

	skb = sock_alloc_send_skb(sk, total_size, flags & MSG_DONTWAIT, err);
	if (skb)
		skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE);

	return skb;
}

/**
 * nfc_alloc_recv_skb - allocate a skb for data exchange responses
 *
 * @size: size to allocate
 * @gfp: gfp flags
 */
struct sk_buff *nfc_alloc_recv_skb(unsigned int size, gfp_t gfp)
{
	struct sk_buff *skb;
	unsigned int total_size;

	total_size = size + 1;

	skb = alloc_skb(total_size, gfp);

	if (skb)
		skb_reserve(skb, 1);

	return skb;
}
EXPORT_SYMBOL(nfc_alloc_recv_skb);
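/*
 * Layout note (illustrative, derived from the helpers above): for a device
 * with tx_headroom = 2 and tx_tailroom = 1, nfc_alloc_send_skb() allocates
 * size + 2 + 1 + NFC_HEADER_SIZE bytes and reserves the front part, so:
 *
 *	[ tx_headroom | NFC_HEADER_SIZE | payload (size) | tx_tailroom ]
 *	                                 ^ skb->data after skb_reserve()
 *
 * which leaves room for the driver to push its header without reallocating.
 */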
/**
 * nfc_targets_found - inform that targets were found
 *
 * @dev: The nfc device that found the targets
 * @targets: array of nfc targets found
 * @n_targets: targets array size
 *
 * The device driver must call this function when one or many nfc targets
 * are found. After calling this function, the device driver must stop
 * polling for targets.
 * NOTE: This function can be called with targets=NULL and n_targets=0 to
 * notify a driver error, meaning that the polling operation cannot complete.
 * IMPORTANT: this function must not be called from an atomic context.
 * In addition, it must also not be called from a context that would prevent
 * the NFC Core from calling other nfc ops entry points concurrently.
 */
int nfc_targets_found(struct nfc_dev *dev,
		      struct nfc_target *targets, int n_targets)
{
	int i;

	pr_debug("dev_name=%s n_targets=%d\n", dev_name(&dev->dev), n_targets);

	for (i = 0; i < n_targets; i++)
		targets[i].idx = dev->target_next_idx++;

	device_lock(&dev->dev);

	if (dev->polling == false) {
		device_unlock(&dev->dev);
		return 0;
	}

	dev->polling = false;
	dev->targets_generation++;

	kfree(dev->targets);
	dev->targets = NULL;

	if (targets) {
		dev->targets = kmemdup(targets,
				       n_targets * sizeof(struct nfc_target),
				       GFP_ATOMIC);

		if (!dev->targets) {
			dev->n_targets = 0;
			device_unlock(&dev->dev);
			return -ENOMEM;
		}
	}

	dev->n_targets = n_targets;
	device_unlock(&dev->dev);

	nfc_genl_targets_found(dev);

	return 0;
}
EXPORT_SYMBOL(nfc_targets_found);

/**
 * nfc_target_lost - inform that an activated target went out of field
 *
 * @dev: The nfc device that had the activated target in field
 * @target_idx: the nfc index of the target
 *
 * The device driver must call this function when the activated target
 * goes out of the field.
 * IMPORTANT: this function must not be called from an atomic context.
 * In addition, it must also not be called from a context that would prevent
 * the NFC Core from calling other nfc ops entry points concurrently.
 */
int nfc_target_lost(struct nfc_dev *dev, u32 target_idx)
{
	const struct nfc_target *tg;
	int i;

	pr_debug("dev_name %s n_target %d\n", dev_name(&dev->dev), target_idx);

	device_lock(&dev->dev);

	for (i = 0; i < dev->n_targets; i++) {
		tg = &dev->targets[i];
		if (tg->idx == target_idx)
			break;
	}

	if (i == dev->n_targets) {
		device_unlock(&dev->dev);
		return -EINVAL;
	}

	dev->targets_generation++;
	dev->n_targets--;
	dev->active_target = NULL;

	if (dev->n_targets) {
		memcpy(&dev->targets[i], &dev->targets[i + 1],
		       (dev->n_targets - i) * sizeof(struct nfc_target));
	} else {
		kfree(dev->targets);
		dev->targets = NULL;
	}

	device_unlock(&dev->dev);

	nfc_genl_target_lost(dev, target_idx);

	return 0;
}
EXPORT_SYMBOL(nfc_target_lost);

inline void nfc_driver_failure(struct nfc_dev *dev, int err)
{
	nfc_targets_found(dev, NULL, 0);
}
EXPORT_SYMBOL(nfc_driver_failure);

int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type)
{
	struct nfc_se *se;
	int rc;

	pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);

	se = nfc_find_se(dev, se_idx);
	if (se)
		return -EALREADY;

	se = kzalloc(sizeof(struct nfc_se), GFP_KERNEL);
	if (!se)
		return -ENOMEM;

	se->idx = se_idx;
	se->type = type;
	se->state = NFC_SE_DISABLED;
	INIT_LIST_HEAD(&se->list);

	list_add(&se->list, &dev->secure_elements);

	rc = nfc_genl_se_added(dev, se_idx, type);
	if (rc < 0) {
		list_del(&se->list);
		kfree(se);

		return rc;
	}

	return 0;
}
EXPORT_SYMBOL(nfc_add_se);

int nfc_remove_se(struct nfc_dev *dev, u32 se_idx)
{
	struct nfc_se *se, *n;
	int rc;

	pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);

	list_for_each_entry_safe(se, n, &dev->secure_elements, list)
		if (se->idx == se_idx) {
			rc = nfc_genl_se_removed(dev, se_idx);
			if (rc < 0)
				return rc;

			list_del(&se->list);
			kfree(se);

			return 0;
		}

	return -EINVAL;
}
EXPORT_SYMBOL(nfc_remove_se);

int nfc_se_transaction(struct nfc_dev *dev, u8 se_idx,
		       struct nfc_evt_transaction *evt_transaction)
{
	int rc;

	pr_debug("transaction: %x\n", se_idx);

	device_lock(&dev->dev);

	if (!evt_transaction) {
		rc = -EPROTO;
		goto out;
	}

	rc = nfc_genl_se_transaction(dev, se_idx, evt_transaction);
out:
	device_unlock(&dev->dev);
	return rc;
}
EXPORT_SYMBOL(nfc_se_transaction);
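/*
 * Illustrative sketch (not part of this file): a driver that finds an
 * embedded secure element advertises it once; enable/disable state is then
 * managed by the core. The index value 1 is an arbitrary example:
 *
 *	// typically from the driver's discover_se op:
 *	nfc_add_se(dev, 1, NFC_SE_EMBEDDED);
 *	// a later userspace enable request lands in nfc_enable_se(dev, 1),
 *	// which calls the driver op and flips se->state to NFC_SE_ENABLED
 */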
int nfc_se_connectivity(struct nfc_dev *dev, u8 se_idx)
{
	int rc;

	pr_debug("connectivity: %x\n", se_idx);

	device_lock(&dev->dev);
	rc = nfc_genl_se_connectivity(dev, se_idx);
	device_unlock(&dev->dev);
	return rc;
}
EXPORT_SYMBOL(nfc_se_connectivity);

static void nfc_release(struct device *d)
{
	struct nfc_dev *dev = to_nfc_dev(d);
	struct nfc_se *se, *n;

	pr_debug("dev_name=%s\n", dev_name(&dev->dev));

	nfc_genl_data_exit(&dev->genl_data);
	kfree(dev->targets);

	list_for_each_entry_safe(se, n, &dev->secure_elements, list) {
		nfc_genl_se_removed(dev, se->idx);
		list_del(&se->list);
		kfree(se);
	}

	ida_free(&nfc_index_ida, dev->idx);
	kfree(dev);
}

static void nfc_check_pres_work(struct work_struct *work)
{
	struct nfc_dev *dev = container_of(work, struct nfc_dev,
					   check_pres_work);
	int rc;

	device_lock(&dev->dev);

	if (dev->active_target && timer_pending(&dev->check_pres_timer) == 0) {
		rc = dev->ops->check_presence(dev, dev->active_target);
		if (rc == -EOPNOTSUPP)
			goto exit;
		if (rc) {
			u32 active_target_idx = dev->active_target->idx;

			device_unlock(&dev->dev);
			nfc_target_lost(dev, active_target_idx);
			return;
		}

		if (!dev->shutting_down)
			mod_timer(&dev->check_pres_timer, jiffies +
				  msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS));
	}

exit:
	device_unlock(&dev->dev);
}

static void nfc_check_pres_timeout(struct timer_list *t)
{
	struct nfc_dev *dev = from_timer(dev, t, check_pres_timer);

	schedule_work(&dev->check_pres_work);
}

const struct class nfc_class = {
	.name = "nfc",
	.dev_release = nfc_release,
};
EXPORT_SYMBOL(nfc_class);

static int match_idx(struct device *d, const void *data)
{
	struct nfc_dev *dev = to_nfc_dev(d);
	const unsigned int *idx = data;

	return dev->idx == *idx;
}

struct nfc_dev *nfc_get_device(unsigned int idx)
{
	struct device *d;

	d = class_find_device(&nfc_class, NULL, &idx, match_idx);
	if (!d)
		return NULL;

	return to_nfc_dev(d);
}

/**
 * nfc_allocate_device - allocate a new nfc device
 *
 * @ops: device operations
 * @supported_protocols: NFC protocols supported by the device
 * @tx_headroom: reserved space at beginning of skb
 * @tx_tailroom: reserved space at end of skb
 */
struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops,
				    u32 supported_protocols,
				    int tx_headroom, int tx_tailroom)
{
	struct nfc_dev *dev;
	int rc;

	if (!ops->start_poll || !ops->stop_poll || !ops->activate_target ||
	    !ops->deactivate_target || !ops->im_transceive)
		return NULL;

	if (!supported_protocols)
		return NULL;

	dev = kzalloc(sizeof(struct nfc_dev), GFP_KERNEL);
	if (!dev)
		return NULL;

	rc = ida_alloc(&nfc_index_ida, GFP_KERNEL);
	if (rc < 0)
		goto err_free_dev;
	dev->idx = rc;

	dev->dev.class = &nfc_class;
	dev_set_name(&dev->dev, "nfc%d", dev->idx);
	device_initialize(&dev->dev);

	dev->ops = ops;
	dev->supported_protocols = supported_protocols;
	dev->tx_headroom = tx_headroom;
	dev->tx_tailroom = tx_tailroom;
	INIT_LIST_HEAD(&dev->secure_elements);

	nfc_genl_data_init(&dev->genl_data);

	dev->rf_mode = NFC_RF_NONE;

	/* first generation must not be 0 */
	dev->targets_generation = 1;

	if (ops->check_presence) {
		timer_setup(&dev->check_pres_timer, nfc_check_pres_timeout, 0);
		INIT_WORK(&dev->check_pres_work, nfc_check_pres_work);
	}

	return dev;

err_free_dev:
	kfree(dev);

	return NULL;
}
EXPORT_SYMBOL(nfc_allocate_device);

/**
 * nfc_register_device - register a nfc device in the nfc subsystem
 *
 * @dev: The nfc device to register
 */
int nfc_register_device(struct nfc_dev *dev)
{
	int rc;

	pr_debug("dev_name=%s\n", dev_name(&dev->dev));

	mutex_lock(&nfc_devlist_mutex);
	nfc_devlist_generation++;
	rc = device_add(&dev->dev);
	mutex_unlock(&nfc_devlist_mutex);

	if (rc < 0)
		return rc;

	rc = nfc_llcp_register_device(dev);
	if (rc)
		pr_err("Could not register llcp device\n");

	device_lock(&dev->dev);
	dev->rfkill = rfkill_alloc(dev_name(&dev->dev), &dev->dev,
				   RFKILL_TYPE_NFC, &nfc_rfkill_ops, dev);
	if (dev->rfkill) {
		if (rfkill_register(dev->rfkill) < 0) {
			rfkill_destroy(dev->rfkill);
			dev->rfkill = NULL;
		}
	}
	dev->shutting_down = false;
	device_unlock(&dev->dev);

	rc = nfc_genl_device_added(dev);
	if (rc)
		pr_debug("The userspace won't be notified that the device %s was added\n",
			 dev_name(&dev->dev));

	return 0;
}
EXPORT_SYMBOL(nfc_register_device);

/**
 * nfc_unregister_device - unregister a nfc device in the nfc subsystem
 *
 * @dev: The nfc device to unregister
 */
void nfc_unregister_device(struct nfc_dev *dev)
{
	int rc;

	pr_debug("dev_name=%s\n", dev_name(&dev->dev));

	rc = nfc_genl_device_removed(dev);
	if (rc)
		pr_debug("The userspace won't be notified that the device %s was removed\n",
			 dev_name(&dev->dev));

	device_lock(&dev->dev);
	if (dev->rfkill) {
		rfkill_unregister(dev->rfkill);
		rfkill_destroy(dev->rfkill);
		dev->rfkill = NULL;
	}
	dev->shutting_down = true;
	device_unlock(&dev->dev);

	if (dev->ops->check_presence) {
		del_timer_sync(&dev->check_pres_timer);
		cancel_work_sync(&dev->check_pres_work);
	}

	nfc_llcp_unregister_device(dev);

	mutex_lock(&nfc_devlist_mutex);
	nfc_devlist_generation++;
	device_del(&dev->dev);
	mutex_unlock(&nfc_devlist_mutex);
}
EXPORT_SYMBOL(nfc_unregister_device);

static int __init nfc_init(void)
{
	int rc;

	pr_info("NFC Core ver %s\n", VERSION);

	rc = class_register(&nfc_class);
	if (rc)
		return rc;

	rc = nfc_genl_init();
	if (rc)
		goto err_genl;

	/* the first generation must not be 0 */
	nfc_devlist_generation = 1;

	rc = rawsock_init();
	if (rc)
		goto err_rawsock;

	rc = nfc_llcp_init();
	if (rc)
		goto err_llcp_sock;

	rc = af_nfc_init();
	if (rc)
		goto err_af_nfc;

	return 0;

err_af_nfc:
	nfc_llcp_exit();
err_llcp_sock:
	rawsock_exit();
err_rawsock:
	nfc_genl_exit();
err_genl:
	class_unregister(&nfc_class);
	return rc;
}

static void __exit nfc_exit(void)
{
	af_nfc_exit();
	nfc_llcp_exit();
	rawsock_exit();
	nfc_genl_exit();
	class_unregister(&nfc_class);
}

subsys_initcall(nfc_init);
module_exit(nfc_exit);

MODULE_AUTHOR("Lauro Ramos Venancio <lauro.venancio@openbossa.org>");
MODULE_DESCRIPTION("NFC Core ver " VERSION);
MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_NFC);
MODULE_ALIAS_GENL_FAMILY(NFC_GENL_NAME);
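/*
 * Illustrative sketch (not part of this file): minimal driver lifecycle for
 * the registration API above. "my_nfc_ops" is a hypothetical ops table; it
 * must provide at least start_poll, stop_poll, activate_target,
 * deactivate_target and im_transceive (see nfc_allocate_device()):
 *
 *	struct nfc_dev *ndev;
 *	int rc;
 *
 *	ndev = nfc_allocate_device(&my_nfc_ops, NFC_PROTO_MIFARE_MASK, 2, 1);
 *	if (!ndev)
 *		return -ENOMEM;
 *	rc = nfc_register_device(ndev);	// exposes nfc%d to userspace
 *	if (rc)
 *		nfc_free_device(ndev);
 *	...
 *	nfc_unregister_device(ndev);	// on removal, then nfc_free_device()
 */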
// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (C) 2011 Novell Inc.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/xattr.h>
#include <linux/ratelimit.h>
#include <linux/fiemap.h>
#include <linux/fileattr.h>
#include <linux/security.h>
#include <linux/namei.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"

int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
		struct iattr *attr)
{
	int err;
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
	bool full_copy_up = false;
	struct dentry *upperdentry;
	const struct cred *old_cred;

	err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
	if (err)
		return err;

	if (attr->ia_valid & ATTR_SIZE) {
		/* Truncate should trigger data copy up as well */
		full_copy_up = true;
	}

	if (!full_copy_up)
		err = ovl_copy_up(dentry);
	else
		err = ovl_copy_up_with_data(dentry);

	if (!err) {
		struct inode *winode = NULL;

		upperdentry = ovl_dentry_upper(dentry);

		if (attr->ia_valid & ATTR_SIZE) {
			winode = d_inode(upperdentry);
			err = get_write_access(winode);
			if (err)
				goto out;
		}

		if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
			attr->ia_valid &= ~ATTR_MODE;
		/*
		 * We might have to translate ovl file into real file object
		 * once use cases emerge.  For now, simply don't let underlying
		 * filesystem rely on attr->ia_file
		 */
		attr->ia_valid &= ~ATTR_FILE;

		/*
		 * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN
		 * set.  Overlayfs does not pass O_TRUNC flag to underlying
		 * filesystem during open -> do not pass ATTR_OPEN.  This
		 * disables optimization in fuse which assumes open(O_TRUNC)
		 * already set file size to 0.  But we never passed O_TRUNC to
		 * fuse.  So by clearing ATTR_OPEN, fuse will be forced to send
		 * setattr request to server.
		 */
		attr->ia_valid &= ~ATTR_OPEN;

		err = ovl_want_write(dentry);
		if (err)
			goto out_put_write;

		inode_lock(upperdentry->d_inode);
		old_cred = ovl_override_creds(dentry->d_sb);
		err = ovl_do_notify_change(ofs, upperdentry, attr);
		ovl_revert_creds(old_cred);
		if (!err)
			ovl_copyattr(dentry->d_inode);
		inode_unlock(upperdentry->d_inode);
		ovl_drop_write(dentry);

out_put_write:
		if (winode)
			put_write_access(winode);
	}
out:
	return err;
}

static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
	bool samefs = ovl_same_fs(ofs);
	unsigned int xinobits = ovl_xino_bits(ofs);
	unsigned int xinoshift = 64 - xinobits;

	if (samefs) {
		/*
		 * When all layers are on the same fs, all real inode
		 * numbers are unique, so we use the overlay st_dev, which is
		 * friendly to du -x.
		 */
		stat->dev = dentry->d_sb->s_dev;
		return;
	} else if (xinobits) {
		/*
		 * All inode numbers of underlying fs should not be using the
		 * high xinobits, so we use high xinobits to partition the
		 * overlay st_ino address space. The high bits hold the fsid
		 * (upper fsid is 0). The lowest xinobit is reserved for mapping
		 * the non-persistent inode numbers range in case of overflow.
		 * This way all overlay inode numbers are unique and use the
		 * overlay st_dev.
		 */
		if (likely(!(stat->ino >> xinoshift))) {
			stat->ino |= ((u64)fsid) << (xinoshift + 1);
			stat->dev = dentry->d_sb->s_dev;
			return;
		} else if (ovl_xino_warn(ofs)) {
			pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
					    dentry, stat->ino, xinobits);
		}
	}

	/* The inode could not be mapped to a unified st_ino address space */
	if (S_ISDIR(dentry->d_inode->i_mode)) {
		/*
		 * Always use the overlay st_dev for directories, so 'find
		 * -xdev' will scan the entire overlay mount and won't cross the
		 * overlay mount boundaries.
		 *
		 * If not all layers are on the same fs the pair {real st_ino;
		 * overlay st_dev} is not unique, so use the non persistent
		 * overlay st_ino for directories.
		 */
		stat->dev = dentry->d_sb->s_dev;
		stat->ino = dentry->d_inode->i_ino;
	} else {
		/*
		 * For non-samefs setup, if we cannot map all layers st_ino
		 * to a unified address space, we need to make sure that st_dev
		 * is unique per underlying fs, so we use the unique anonymous
		 * bdev assigned to the underlying fs.
		 */
		stat->dev = ofs->fs[fsid].pseudo_dev;
	}
}
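/*
 * Worked example (illustrative numbers): with xinobits = 8, xinoshift = 56.
 * A lower inode on fsid 2 with st_ino = 0x1234 passes the overflow check
 * (no bits above bit 55), so it is mapped to st_ino = (2ULL << 57) | 0x1234
 * with the overlay st_dev. An st_ino with any of bits 56-63 set cannot be
 * mapped and falls back to the per-layer pseudo_dev (non-dir) or to the
 * non-persistent overlay st_ino (dir).
 */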
int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
		struct kstat *stat, u32 request_mask, unsigned int flags)
{
	struct dentry *dentry = path->dentry;
	enum ovl_path_type type;
	struct path realpath;
	const struct cred *old_cred;
	struct inode *inode = d_inode(dentry);
	bool is_dir = S_ISDIR(inode->i_mode);
	int fsid = 0;
	int err;
	bool metacopy_blocks = false;

	metacopy_blocks = ovl_is_metacopy_dentry(dentry);

	type = ovl_path_real(dentry, &realpath);
	old_cred = ovl_override_creds(dentry->d_sb);
	err = vfs_getattr_nosec(&realpath, stat, request_mask, flags);
	if (err)
		goto out;

	/* Report the effective immutable/append-only STATX flags */
	generic_fill_statx_attr(inode, stat);

	/*
	 * For non-dir or same fs, we use st_ino of the copy up origin.
	 * This guarantees constant st_dev/st_ino across copy up.
	 * With xino feature and non-samefs, we use st_ino of the copy up
	 * origin masked with high bits that represent the layer id.
	 *
	 * If lower filesystem supports NFS file handles, this also guarantees
	 * persistent st_ino across mount cycle.
	 */
	if (!is_dir || ovl_same_dev(OVL_FS(dentry->d_sb))) {
		if (!OVL_TYPE_UPPER(type)) {
			fsid = ovl_layer_lower(dentry)->fsid;
		} else if (OVL_TYPE_ORIGIN(type)) {
			struct kstat lowerstat;
			u32 lowermask = STATX_INO | STATX_BLOCKS |
					(!is_dir ? STATX_NLINK : 0);

			ovl_path_lower(dentry, &realpath);
			err = vfs_getattr_nosec(&realpath, &lowerstat, lowermask,
						flags);
			if (err)
				goto out;

			/*
			 * Lower hardlinks may be broken on copy up to different
			 * upper files, so we cannot use the lower origin st_ino
			 * for those different files, even for the same fs case.
			 *
			 * Similarly, several redirected dirs can point to the
			 * same dir on a lower layer. With the "verify_lower"
			 * feature, we do not use the lower origin st_ino, if
			 * we haven't verified that this redirect is unique.
			 *
			 * With inodes index enabled, it is safe to use st_ino
			 * of an indexed origin. The index validates that the
			 * upper hardlink is not broken and that a redirected
			 * dir is the only redirect to that origin.
			 */
			if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
			    (!ovl_verify_lower(dentry->d_sb) &&
			     (is_dir || lowerstat.nlink == 1))) {
				fsid = ovl_layer_lower(dentry)->fsid;
				stat->ino = lowerstat.ino;
			}

			/*
			 * If we are querying a metacopy dentry and lower
			 * dentry is data dentry, then use the blocks we
			 * queried just now. We don't have to do additional
			 * vfs_getattr(). If lower itself is metacopy, then
			 * additional vfs_getattr() is unavoidable.
			 */
			if (metacopy_blocks &&
			    realpath.dentry == ovl_dentry_lowerdata(dentry)) {
				stat->blocks = lowerstat.blocks;
				metacopy_blocks = false;
			}
		}

		if (metacopy_blocks) {
			/*
			 * If lower is not same as lowerdata or if there was
			 * no origin on upper, we can end up here.
			 * With lazy lowerdata lookup, guess lowerdata blocks
			 * from size to avoid lowerdata lookup on stat(2).
			 */
			struct kstat lowerdatastat;
			u32 lowermask = STATX_BLOCKS;

			ovl_path_lowerdata(dentry, &realpath);
			if (realpath.dentry) {
				err = vfs_getattr_nosec(&realpath, &lowerdatastat,
							lowermask, flags);
				if (err)
					goto out;
			} else {
				lowerdatastat.blocks =
					round_up(stat->size, stat->blksize) >> 9;
			}
			stat->blocks = lowerdatastat.blocks;
		}
	}

	ovl_map_dev_ino(dentry, stat, fsid);

	/*
	 * It's probably not worth it to count subdirs to get the
	 * correct link count.  nlink=1 seems to pacify 'find' and
	 * other utilities.
	 */
	if (is_dir && OVL_TYPE_MERGE(type))
		stat->nlink = 1;
	/*
	 * Return the overlay inode nlinks for indexed upper inodes.
	 * Overlay inode nlink counts the union of the upper hardlinks
	 * and non-covered lower hardlinks. It does not include the upper
	 * index hardlink.
	 */
	if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
		stat->nlink = dentry->d_inode->i_nlink;

out:
	ovl_revert_creds(old_cred);

	return err;
}

int ovl_permission(struct mnt_idmap *idmap,
		   struct inode *inode, int mask)
{
	struct inode *upperinode = ovl_inode_upper(inode);
	struct inode *realinode;
	struct path realpath;
	const struct cred *old_cred;
	int err;

	/* Careful in RCU walk mode */
	realinode = ovl_i_path_real(inode, &realpath);
	if (!realinode) {
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		return -ECHILD;
	}

	/*
	 * Check overlay inode with the creds of task and underlying inode
	 * with creds of mounter
	 */
	err = generic_permission(&nop_mnt_idmap, inode, mask);
	if (err)
		return err;

	old_cred = ovl_override_creds(inode->i_sb);
	if (!upperinode &&
	    !special_file(realinode->i_mode) && mask & MAY_WRITE) {
		mask &= ~(MAY_WRITE | MAY_APPEND);
		/* Make sure mounter can read file for copy up later */
		mask |= MAY_READ;
	}
	err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
	ovl_revert_creds(old_cred);

	return err;
}

static const char *ovl_get_link(struct dentry *dentry,
				struct inode *inode,
				struct delayed_call *done)
{
	const struct cred *old_cred;
	const char *p;

	if (!dentry)
		return ERR_PTR(-ECHILD);

	old_cred = ovl_override_creds(dentry->d_sb);
	p = vfs_get_link(ovl_dentry_real(dentry), done);
	ovl_revert_creds(old_cred);
	return p;
}

#ifdef CONFIG_FS_POSIX_ACL
/*
 * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
 * of the POSIX ACLs retrieved from the lower layer to this function to not
 * alter the POSIX ACLs for the underlying filesystem.
 */
static void ovl_idmap_posix_acl(const struct inode *realinode,
				struct mnt_idmap *idmap,
				struct posix_acl *acl)
{
	struct user_namespace *fs_userns = i_user_ns(realinode);

	for (unsigned int i = 0; i < acl->a_count; i++) {
		vfsuid_t vfsuid;
		vfsgid_t vfsgid;

		struct posix_acl_entry *e = &acl->a_entries[i];
		switch (e->e_tag) {
		case ACL_USER:
			vfsuid = make_vfsuid(idmap, fs_userns, e->e_uid);
			e->e_uid = vfsuid_into_kuid(vfsuid);
			break;
		case ACL_GROUP:
			vfsgid = make_vfsgid(idmap, fs_userns, e->e_gid);
			e->e_gid = vfsgid_into_kgid(vfsgid);
			break;
		}
	}
}
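/*
 * Worked example (illustrative): if a layer is mounted with an idmapping
 * that maps fs uid 1000 to vfs uid 0, an ACL_USER entry read from that
 * layer with e_uid == 1000 is rewritten by ovl_idmap_posix_acl() on the
 * cloned acl so callers observe e_uid == 0, consistent with what the
 * idmapped mount reports for the file owner itself.
 */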
/*
 * The @noperm argument is used to skip permission checking and is a temporary
 * measure. Quoting Miklos from an earlier discussion:
 *
 * > So there are two paths to getting an acl:
 * > 1) permission checking and 2) retrieving the value via getxattr(2).
 * > This is a similar situation as reading a symlink vs. following it.
 * > When following a symlink overlayfs always reads the link on the
 * > underlying fs just as if it was a readlink(2) call, calling
 * > security_inode_readlink() instead of security_inode_follow_link().
 * > This is logical: we are reading the link from the underlying storage,
 * > and following it on overlayfs.
 * >
 * > Applying the same logic to acl: we do need to call the
 * > security_inode_getxattr() on the underlying fs, even if just want to
 * > check permissions on overlay. This is currently not done, which is an
 * > inconsistency.
 * >
 * > Maybe adding the check to ovl_get_acl() is the right way to go, but
 * > I'm a little afraid of a performance regression. Will look into that.
 *
 * Until we have made a decision allow this helper to take the @noperm
 * argument. We should hopefully be able to remove it soon.
 */
struct posix_acl *ovl_get_acl_path(const struct path *path,
				   const char *acl_name, bool noperm)
{
	struct posix_acl *real_acl, *clone;
	struct mnt_idmap *idmap;
	struct inode *realinode = d_inode(path->dentry);

	idmap = mnt_idmap(path->mnt);

	if (noperm)
		real_acl = get_inode_acl(realinode, posix_acl_type(acl_name));
	else
		real_acl = vfs_get_acl(idmap, path->dentry, acl_name);
	if (IS_ERR_OR_NULL(real_acl))
		return real_acl;

	if (!is_idmapped_mnt(path->mnt))
		return real_acl;

	/*
	 * We cannot alter the ACLs returned from the relevant layer as that
	 * would alter the cached values filesystem wide for the lower
	 * filesystem. Instead we can clone the ACLs and then apply the
	 * relevant idmapping of the layer.
	 */
	clone = posix_acl_clone(real_acl, GFP_KERNEL);
	posix_acl_release(real_acl); /* release original acl */
	if (!clone)
		return ERR_PTR(-ENOMEM);

	ovl_idmap_posix_acl(realinode, idmap, clone);

	return clone;
}

/*
 * When the relevant layer is an idmapped mount we need to take the idmapping
 * of the layer into account and translate any ACL_{GROUP,USER} values
 * according to the idmapped mount.
 *
 * We cannot alter the ACLs returned from the relevant layer as that would
 * alter the cached values filesystem wide for the lower filesystem. Instead we
 * can clone the ACLs and then apply the relevant idmapping of the layer.
 *
 * This is obviously only relevant when idmapped layers are used.
 */
struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
				 struct inode *inode, int type,
				 bool rcu, bool noperm)
{
	struct inode *realinode;
	struct posix_acl *acl;
	struct path realpath;

	/* Careful in RCU walk mode */
	realinode = ovl_i_path_real(inode, &realpath);
	if (!realinode) {
		WARN_ON(!rcu);
		return ERR_PTR(-ECHILD);
	}

	if (!IS_POSIXACL(realinode))
		return NULL;

	if (rcu) {
		/*
		 * If the layer is idmapped drop out of RCU path walk
		 * so we can clone the ACLs.
		 */
		if (is_idmapped_mnt(realpath.mnt))
			return ERR_PTR(-ECHILD);

		acl = get_cached_acl_rcu(realinode, type);
	} else {
		const struct cred *old_cred;

		old_cred = ovl_override_creds(inode->i_sb);
		acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type),
				       noperm);
		ovl_revert_creds(old_cred);
	}

	return acl;
}
static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
				 struct posix_acl *acl, int type)
{
	int err;
	struct path realpath;
	const char *acl_name;
	const struct cred *old_cred;
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
	struct dentry *upperdentry = ovl_dentry_upper(dentry);
	struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);

	/*
	 * If ACL is to be removed from a lower file, check if it exists in
	 * the first place before copying it up.
	 */
	acl_name = posix_acl_xattr_name(type);
	if (!acl && !upperdentry) {
		struct posix_acl *real_acl;

		ovl_path_lower(dentry, &realpath);
		old_cred = ovl_override_creds(dentry->d_sb);
		real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry,
				       acl_name);
		ovl_revert_creds(old_cred);
		if (IS_ERR(real_acl)) {
			err = PTR_ERR(real_acl);
			goto out;
		}
		posix_acl_release(real_acl);
	}

	if (!upperdentry) {
		err = ovl_copy_up(dentry);
		if (err)
			goto out;

		realdentry = ovl_dentry_upper(dentry);
	}

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	old_cred = ovl_override_creds(dentry->d_sb);
	if (acl)
		err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
	else
		err = ovl_do_remove_acl(ofs, realdentry, acl_name);
	ovl_revert_creds(old_cred);
	ovl_drop_write(dentry);

	/* copy c/mtime */
	ovl_copyattr(inode);
out:
	return err;
}

int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
		struct posix_acl *acl, int type)
{
	int err;
	struct inode *inode = d_inode(dentry);
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *realinode = ovl_inode_real(inode);

	if (!IS_POSIXACL(d_inode(workdir)))
		return -EOPNOTSUPP;
	if (!realinode->i_op->set_acl)
		return -EOPNOTSUPP;
	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
		return acl ? -EACCES : 0;
	if (!inode_owner_or_capable(&nop_mnt_idmap, inode))
		return -EPERM;

	/*
	 * Check if sgid bit needs to be cleared (actual setacl operation will
	 * be done with mounter's capabilities and so that won't do it for us).
	 */
	if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS &&
	    !in_group_p(inode->i_gid) &&
	    !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) {
		struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };

		err = ovl_setattr(&nop_mnt_idmap, dentry, &iattr);
		if (err)
			return err;
	}

	return ovl_set_or_remove_acl(dentry, inode, acl, type);
}
#endif

int ovl_update_time(struct inode *inode, int flags)
{
	if (flags & S_ATIME) {
		struct ovl_fs *ofs = OVL_FS(inode->i_sb);
		struct path upperpath = {
			.mnt = ovl_upper_mnt(ofs),
			.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
		};

		if (upperpath.dentry) {
			touch_atime(&upperpath);
			inode_set_atime_to_ts(inode,
					      inode_get_atime(d_inode(upperpath.dentry)));
		}
	}
	return 0;
}

static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		      u64 start, u64 len)
{
	int err;
	struct inode *realinode = ovl_inode_realdata(inode);
	const struct cred *old_cred;

	if (!realinode)
		return -EIO;

	if (!realinode->i_op->fiemap)
		return -EOPNOTSUPP;

	old_cred = ovl_override_creds(inode->i_sb);
	err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
	ovl_revert_creds(old_cred);

	return err;
}
/*
 * Work around the fact that security_file_ioctl() takes a file argument.
 * Introducing security_inode_fileattr_get/set() hooks would solve this issue
 * properly.
 */
static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa,
				 bool set)
{
	struct file *file;
	unsigned int cmd;
	int err;
	unsigned int flags;

	flags = O_RDONLY;
	if (force_o_largefile())
		flags |= O_LARGEFILE;

	file = dentry_open(realpath, flags, current_cred());
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (set)
		cmd = fa->fsx_valid ? FS_IOC_FSSETXATTR : FS_IOC_SETFLAGS;
	else
		cmd = fa->fsx_valid ? FS_IOC_FSGETXATTR : FS_IOC_GETFLAGS;

	err = security_file_ioctl(file, cmd, 0);
	fput(file);

	return err;
}

int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
{
	int err;

	err = ovl_security_fileattr(realpath, fa, true);
	if (err)
		return err;

	return vfs_fileattr_set(mnt_idmap(realpath->mnt), realpath->dentry, fa);
}

int ovl_fileattr_set(struct mnt_idmap *idmap,
		     struct dentry *dentry, struct fileattr *fa)
{
	struct inode *inode = d_inode(dentry);
	struct path upperpath;
	const struct cred *old_cred;
	unsigned int flags;
	int err;

	err = ovl_copy_up(dentry);
	if (!err) {
		ovl_path_real(dentry, &upperpath);

		err = ovl_want_write(dentry);
		if (err)
			goto out;

		old_cred = ovl_override_creds(inode->i_sb);
		/*
		 * Store immutable/append-only flags in xattr and clear them
		 * in upper fileattr (in case they were set by older kernel)
		 * so children of "ovl-immutable" directories and lower aliases
		 * of "ovl-immutable" hardlinks could be copied up.
		 * Clear xattr when flags are cleared.
		 */
		err = ovl_set_protattr(inode, upperpath.dentry, fa);
		if (!err)
			err = ovl_real_fileattr_set(&upperpath, fa);
		ovl_revert_creds(old_cred);
		ovl_drop_write(dentry);

		/*
		 * Merge real inode flags with inode flags read from
		 * overlay.protattr xattr
		 */
		flags = ovl_inode_real(inode)->i_flags & OVL_COPY_I_FLAGS_MASK;

		BUILD_BUG_ON(OVL_PROT_I_FLAGS_MASK & ~OVL_COPY_I_FLAGS_MASK);
		flags |= inode->i_flags & OVL_PROT_I_FLAGS_MASK;
		inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK);

		/* Update ctime */
		ovl_copyattr(inode);
	}
out:
	return err;
}

/* Convert inode protection flags to fileattr flags */
static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
{
	BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL);
	BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);

	if (inode->i_flags & S_APPEND) {
		fa->flags |= FS_APPEND_FL;
		fa->fsx_xflags |= FS_XFLAG_APPEND;
	}
	if (inode->i_flags & S_IMMUTABLE) {
		fa->flags |= FS_IMMUTABLE_FL;
		fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
	}
}

int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
{
	int err;

	err = ovl_security_fileattr(realpath, fa, false);
	if (err)
		return err;

	err = vfs_fileattr_get(realpath->dentry, fa);
	if (err == -ENOIOCTLCMD)
		err = -ENOTTY;
	return err;
}

int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
	struct inode *inode = d_inode(dentry);
	struct path realpath;
	const struct cred *old_cred;
	int err;

	ovl_path_real(dentry, &realpath);
	old_cred = ovl_override_creds(inode->i_sb);
	err = ovl_real_fileattr_get(&realpath, fa);
	ovl_fileattr_prot_flags(inode, fa);
	ovl_revert_creds(old_cred);

	return err;
}

static const struct inode_operations ovl_file_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.get_inode_acl	= ovl_get_inode_acl,
	.get_acl	= ovl_get_acl,
	.set_acl	= ovl_set_acl,
	.update_time	= ovl_update_time,
	.fiemap		= ovl_fiemap,
	.fileattr_get	= ovl_fileattr_get,
	.fileattr_set	= ovl_fileattr_set,
};

static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr	= ovl_setattr,
	.get_link	= ovl_get_link,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.update_time	= ovl_update_time,
};

static const struct inode_operations ovl_special_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.get_inode_acl	= ovl_get_inode_acl,
	.get_acl	= ovl_get_acl,
	.set_acl	= ovl_set_acl,
	.update_time	= ovl_update_time,
};
static const struct address_space_operations ovl_aops = {
	/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
	.direct_IO		= noop_direct_IO,
};

/*
 * It is possible to stack overlayfs instance on top of another
 * overlayfs instance as lower layer. We need to annotate the
 * stackable i_mutex locks according to stack level of the super
 * block instance. An overlayfs instance can never be in stack
 * depth 0 (there is always a real fs below it).  An overlayfs
 * inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
 *
 * For example, here is a snip from /proc/lockdep_chains after
 * dir_iterate of nested overlayfs:
 *
 * [...] &ovl_i_mutex_dir_key[depth]   (stack_depth=2)
 * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
 * [...] &type->i_mutex_dir_key        (stack_depth=0)
 *
 * Locking order w.r.t ovl_want_write() is important for nested overlayfs.
 *
 * This chain is valid:
 * - inode->i_rwsem			(inode_lock[2])
 * - upper_mnt->mnt_sb->s_writers	(ovl_want_write[0])
 * - OVL_I(inode)->lock			(ovl_inode_lock[2])
 * - OVL_I(lowerinode)->lock		(ovl_inode_lock[1])
 *
 * And this chain is valid:
 * - inode->i_rwsem			(inode_lock[2])
 * - OVL_I(inode)->lock			(ovl_inode_lock[2])
 * - lowerinode->i_rwsem		(inode_lock[1])
 * - OVL_I(lowerinode)->lock		(ovl_inode_lock[1])
 *
 * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is
 * held, because it is in reverse order of the non-nested case using the same
 * upper fs:
 * - inode->i_rwsem			(inode_lock[1])
 * - upper_mnt->mnt_sb->s_writers	(ovl_want_write[0])
 * - OVL_I(inode)->lock			(ovl_inode_lock[1])
 */
#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH

static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
	static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
	static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
	static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];

	int depth = inode->i_sb->s_stack_depth - 1;

	if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
		depth = 0;

	if (S_ISDIR(inode->i_mode))
		lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
	else
		lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);

	lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
#endif
}

static void ovl_next_ino(struct inode *inode)
{
	struct ovl_fs *ofs = OVL_FS(inode->i_sb);

	inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
	if (unlikely(!inode->i_ino))
		inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
}
*/ if (S_ISDIR(inode->i_mode)) { ovl_next_ino(inode); if (xinobits) { inode->i_ino &= ~0UL >> xinobits; inode->i_ino |= 1UL << xinoshift; } } } void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, unsigned long ino, int fsid) { struct inode *realinode; struct ovl_inode *oi = OVL_I(inode); oi->__upperdentry = oip->upperdentry; oi->oe = oip->oe; oi->redirect = oip->redirect; oi->lowerdata_redirect = oip->lowerdata_redirect; realinode = ovl_inode_real(inode); ovl_copyattr(inode); ovl_copyflags(realinode, inode); ovl_map_ino(inode, ino, fsid); } static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; #endif ovl_lockdep_annotate_inode_mutex_key(inode); switch (mode & S_IFMT) { case S_IFREG: inode->i_op = &ovl_file_inode_operations; inode->i_fop = &ovl_file_operations; inode->i_mapping->a_ops = &ovl_aops; break; case S_IFDIR: inode->i_op = &ovl_dir_inode_operations; inode->i_fop = &ovl_dir_operations; break; case S_IFLNK: inode->i_op = &ovl_symlink_inode_operations; break; default: inode->i_op = &ovl_special_inode_operations; init_special_inode(inode, mode, rdev); break; } } /* * With inodes index enabled, an overlay inode nlink counts the union of upper * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure * upper inode, the following nlink modifying operations can happen: * * 1. Lower hardlink copy up * 2. Upper hardlink created, unlinked or renamed over * 3. Lower hardlink whiteout or renamed over * * For the first, copy up case, the union nlink does not change, whether the * operation succeeds or fails, but the upper inode nlink may change. * Therefore, before copy up, we store the union nlink value relative to the * lower inode nlink in the index inode xattr .overlay.nlink. * * For the second, upper hardlink case, the union nlink should be incremented * or decremented IFF the operation succeeds, aligned with nlink change of the * upper inode. Therefore, before link/unlink/rename, we store the union nlink * value relative to the upper inode nlink in the index inode. * * For the last, lower cover up case, we simplify things by preceding the * whiteout or cover up with copy up. This makes sure that there is an index * upper inode where the nlink xattr can be stored before the copied up upper * entry is unlink. 
*/ #define OVL_NLINK_ADD_UPPER (1 << 0) /* * On-disk format for indexed nlink: * * nlink relative to the upper inode - "U[+-]NUM" * nlink relative to the lower inode - "L[+-]NUM" */ static int ovl_set_nlink_common(struct dentry *dentry, struct dentry *realdentry, const char *format) { struct inode *inode = d_inode(dentry); struct inode *realinode = d_inode(realdentry); char buf[13]; int len; len = snprintf(buf, sizeof(buf), format, (int) (inode->i_nlink - realinode->i_nlink)); if (WARN_ON(len >= sizeof(buf))) return -EIO; return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry), OVL_XATTR_NLINK, buf, len); } int ovl_set_nlink_upper(struct dentry *dentry) { return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i"); } int ovl_set_nlink_lower(struct dentry *dentry) { return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i"); } unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback) { int nlink_diff; int nlink; char buf[13]; int err; if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1) return fallback; err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1); if (err < 0) goto fail; buf[err] = '\0'; if ((buf[0] != 'L' && buf[0] != 'U') || (buf[1] != '+' && buf[1] != '-')) goto fail; err = kstrtoint(buf + 1, 10, &nlink_diff); if (err < 0) goto fail; nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink; nlink += nlink_diff; if (nlink <= 0) goto fail; return nlink; fail: pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n", upperdentry, err); return fallback; } struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) { struct inode *inode; inode = new_inode(sb); if (inode) ovl_fill_inode(inode, mode, rdev); return inode; } static int ovl_inode_test(struct inode *inode, void *data) { return inode->i_private == data; } static int ovl_inode_set(struct inode *inode, void *data) { inode->i_private = data; return 0; } static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, struct dentry *upperdentry, bool strict) { /* * For directories, @strict verify from lookup path performs consistency * checks, so NULL lower/upper in dentry must match NULL lower/upper in * inode. Non @strict verify from NFS handle decode path passes NULL for * 'unknown' lower/upper. */ if (S_ISDIR(inode->i_mode) && strict) { /* Real lower dir moved to upper layer under us? */ if (!lowerdentry && ovl_inode_lower(inode)) return false; /* Lookup of an uncovered redirect origin? */ if (!upperdentry && ovl_inode_upper(inode)) return false; } /* * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL. * This happens when finding a copied up overlay inode for a renamed * or hardlinked overlay dentry and lower dentry cannot be followed * by origin because lower fs does not support file handles. */ if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry)) return false; /* * Allow non-NULL __upperdentry in inode even if upperdentry is NULL. * This happens when finding a lower alias for a copied up hard link. */ if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry)) return false; return true; } struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, bool is_upper) { struct inode *inode, *key = d_inode(real); inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); if (!inode) return NULL; if (!ovl_verify_inode(inode, is_upper ? NULL : real, is_upper ? 
real : NULL, false)) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir) { struct inode *key = d_inode(dir); struct inode *trap; bool res; trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); if (!trap) return false; res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) && !ovl_inode_lower(trap); iput(trap); return res; } /* * Create an inode cache entry for layer root dir, that will intentionally * fail ovl_verify_inode(), so any lookup that will find some layer root * will fail. */ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) { struct inode *key = d_inode(dir); struct inode *trap; if (!d_is_dir(dir)) return ERR_PTR(-ENOTDIR); trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test, ovl_inode_set, key); if (!trap) return ERR_PTR(-ENOMEM); if (!(trap->i_state & I_NEW)) { /* Conflicting layer roots? */ iput(trap); return ERR_PTR(-ELOOP); } trap->i_mode = S_IFDIR; trap->i_flags = S_DEAD; unlock_new_inode(trap); return trap; } /* * Does overlay inode need to be hashed by lower inode? */ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, struct dentry *lower, bool index) { struct ovl_fs *ofs = OVL_FS(sb); /* No, if pure upper */ if (!lower) return false; /* Yes, if already indexed */ if (index) return true; /* Yes, if won't be copied up */ if (!ovl_upper_mnt(ofs)) return true; /* No, if lower hardlink is or will be broken on copy up */ if ((upper || !ovl_indexdir(sb)) && !d_is_dir(lower) && d_inode(lower)->i_nlink > 1) return false; /* No, if non-indexed upper with NFS export */ if (ofs->config.nfs_export && upper) return false; /* Otherwise, hash by lower inode for fsnotify */ return true; } static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode, struct inode *key) { return newinode ? inode_insert5(newinode, (unsigned long) key, ovl_inode_test, ovl_inode_set, key) : iget5_locked(sb, (unsigned long) key, ovl_inode_test, ovl_inode_set, key); } struct inode *ovl_get_inode(struct super_block *sb, struct ovl_inode_params *oip) { struct ovl_fs *ofs = OVL_FS(sb); struct dentry *upperdentry = oip->upperdentry; struct ovl_path *lowerpath = ovl_lowerpath(oip->oe); struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; struct path realpath = { .dentry = upperdentry ?: lowerdentry, .mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt, }; bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, oip->index); int fsid = bylower ? lowerpath->layer->fsid : 0; bool is_dir; unsigned long ino = 0; int err = oip->newinode ? -EEXIST : -ENOMEM; if (!realinode) realinode = d_inode(lowerdentry); /* * Copy up origin (lower) may exist for non-indexed upper, but we must * not use lower as hash key if this is a broken hardlink. */ is_dir = S_ISDIR(realinode->i_mode); if (upperdentry || bylower) { struct inode *key = d_inode(bylower ? lowerdentry : upperdentry); unsigned int nlink = is_dir ? 1 : realinode->i_nlink; inode = ovl_iget5(sb, oip->newinode, key); if (!inode) goto out_err; if (!(inode->i_state & I_NEW)) { /* * Verify that the underlying files stored in the inode * match those in the dentry. 
*/ if (!ovl_verify_inode(inode, lowerdentry, upperdentry, true)) { iput(inode); err = -ESTALE; goto out_err; } dput(upperdentry); ovl_free_entry(oip->oe); kfree(oip->redirect); kfree(oip->lowerdata_redirect); goto out; } /* Recalculate nlink for non-dir due to indexing */ if (!is_dir) nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry, nlink); set_nlink(inode, nlink); ino = key->i_ino; } else { /* Lower hardlink that will be broken on copy up */ inode = new_inode(sb); if (!inode) { err = -ENOMEM; goto out_err; } ino = realinode->i_ino; fsid = lowerpath->layer->fsid; } ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); ovl_inode_init(inode, oip, ino, fsid); if (upperdentry && ovl_is_impuredir(sb, upperdentry)) ovl_set_flag(OVL_IMPURE, inode); if (oip->index) ovl_set_flag(OVL_INDEX, inode); if (bylower) ovl_set_flag(OVL_CONST_INO, inode); /* Check for non-merge dir that may have whiteouts */ if (is_dir) { if (((upperdentry && lowerdentry) || ovl_numlower(oip->oe) > 1) || ovl_path_check_origin_xattr(ofs, &realpath)) { ovl_set_flag(OVL_WHITEOUTS, inode); } } /* Check for immutable/append-only inode flags in xattr */ if (upperdentry) ovl_check_protattr(inode, upperdentry); if (inode->i_state & I_NEW) unlock_new_inode(inode); out: return inode; out_err: pr_warn_ratelimited("failed to get inode (%i)\n", err); inode = ERR_PTR(err); goto out; }
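/*
 * Editor's illustration -- not part of the original sources. The xino
 * packing in ovl_map_ino() above ORs the layer's fsid into the high
 * "xinobits" of i_ino; the shift is xinoshift + 1 because the lowest
 * xino bit is reserved for the non-persistent inode number range. A
 * minimal standalone userspace sketch of the same arithmetic (names
 * and values here are hypothetical):
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t pack_xino(uint64_t ino, int fsid, int xinobits)
{
	unsigned int xinoshift = 64 - xinobits;

	/* xino overflow: the layer cannot be encoded, return ino as-is */
	if (ino >> xinoshift)
		return ino;
	return ino | ((uint64_t)fsid << (xinoshift + 1));
}

int main(void)
{
	/* ino 42 from the layer with fsid 2, xino enabled with 8 bits */
	printf("%#llx\n", (unsigned long long)pack_xino(42, 2, 8));
	/* prints 0x40000000000002a: fsid 2 in the high bits, ino 42 below */
	return 0;
}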
// SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include "cgroup-internal.h" #include <trace/events/cgroup.h> /* * Update the CGRP_FROZEN bit of cgrp->flags. * Return true if the flag was updated; false if it did not change. */ static bool cgroup_update_frozen_flag(struct cgroup *cgrp, bool frozen) { lockdep_assert_held(&css_set_lock); /* Already there? */ if (test_bit(CGRP_FROZEN, &cgrp->flags) == frozen) return false; if (frozen) set_bit(CGRP_FROZEN, &cgrp->flags); else clear_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); return true; } /* * Propagate the cgroup frozen state upwards through the cgroup tree. */ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) { int desc = 1; /* * If the new state is frozen, some freezing ancestor cgroups may change * their state too, depending on whether all their descendants are frozen. * * Otherwise, all ancestor cgroups are forced into the non-frozen state. */ while ((cgrp = cgroup_parent(cgrp))) { if (frozen) { cgrp->freezer.nr_frozen_descendants += desc; if (!test_bit(CGRP_FREEZE, &cgrp->flags) || (cgrp->freezer.nr_frozen_descendants != cgrp->nr_descendants)) continue; } else { cgrp->freezer.nr_frozen_descendants -= desc; } if (cgroup_update_frozen_flag(cgrp, frozen)) desc++; } } /* * Revisit the cgroup frozen state. * Checks if the cgroup is really frozen and performs all state transitions. */ void cgroup_update_frozen(struct cgroup *cgrp) { bool frozen; /* * If the cgroup has to be frozen (CGRP_FREEZE bit set), * and all tasks are frozen and/or stopped, let's consider * the cgroup frozen. Otherwise it's not frozen. */ frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); /* If the flag was updated, update the state of ancestor cgroups. */ if (cgroup_update_frozen_flag(cgrp, frozen)) cgroup_propagate_frozen(cgrp, frozen); } /* * Increment cgroup's nr_frozen_tasks. */ static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) { cgrp->freezer.nr_frozen_tasks++; } /* * Decrement cgroup's nr_frozen_tasks. 
*/ static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) { cgrp->freezer.nr_frozen_tasks--; WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); } /* * Enter frozen/stopped state, if not yet there. Update cgroup's counters, * and revisit the state of the cgroup, if necessary. */ void cgroup_enter_frozen(void) { struct cgroup *cgrp; if (current->frozen) return; spin_lock_irq(&css_set_lock); current->frozen = true; cgrp = task_dfl_cgroup(current); cgroup_inc_frozen_cnt(cgrp); cgroup_update_frozen(cgrp); spin_unlock_irq(&css_set_lock); } /* * Conditionally leave frozen/stopped state. Update cgroup's counters, * and revisit the state of the cgroup, if necessary. * * If always_leave is not set and the cgroup is freezing, * we're racing with the freeze operation. In this case, we don't * drop the frozen counter, to avoid a transient switch to * the unfrozen state. */ void cgroup_leave_frozen(bool always_leave) { struct cgroup *cgrp; spin_lock_irq(&css_set_lock); cgrp = task_dfl_cgroup(current); if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { cgroup_dec_frozen_cnt(cgrp); cgroup_update_frozen(cgrp); WARN_ON_ONCE(!current->frozen); current->frozen = false; } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { spin_lock(&current->sighand->siglock); current->jobctl |= JOBCTL_TRAP_FREEZE; set_thread_flag(TIF_SIGPENDING); spin_unlock(&current->sighand->siglock); } spin_unlock_irq(&css_set_lock); } /* * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE * jobctl bit. */ static void cgroup_freeze_task(struct task_struct *task, bool freeze) { unsigned long flags; /* If the task is about to die, don't bother with freezing it. */ if (!lock_task_sighand(task, &flags)) return; if (freeze) { task->jobctl |= JOBCTL_TRAP_FREEZE; signal_wake_up(task, false); } else { task->jobctl &= ~JOBCTL_TRAP_FREEZE; wake_up_process(task); } unlock_task_sighand(task, &flags); } /* * Freeze or unfreeze all tasks in the given cgroup. */ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) { struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); if (freeze) set_bit(CGRP_FREEZE, &cgrp->flags); else clear_bit(CGRP_FREEZE, &cgrp->flags); spin_unlock_irq(&css_set_lock); if (freeze) TRACE_CGROUP_PATH(freeze, cgrp); else TRACE_CGROUP_PATH(unfreeze, cgrp); css_task_iter_start(&cgrp->self, 0, &it); while ((task = css_task_iter_next(&it))) { /* * Ignore kernel threads here. Freezing cgroups containing * kthreads isn't supported. */ if (task->flags & PF_KTHREAD) continue; cgroup_freeze_task(task, freeze); } css_task_iter_end(&it); /* * Cgroup state should be revisited here to cover empty leaf cgroups * and cgroups whose descendants are already in the desired state. */ spin_lock_irq(&css_set_lock); if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) cgroup_update_frozen(cgrp); spin_unlock_irq(&css_set_lock); } /* * Adjust the task state (freeze or unfreeze) and revisit the state of * source and destination cgroups. */ void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst) { lockdep_assert_held(&css_set_lock); /* * Kernel threads are not supposed to be frozen at all. */ if (task->flags & PF_KTHREAD) return; /* * No changes are necessary if neither the src nor the dst cgroup * is freezing and the task is not frozen. */ if (!test_bit(CGRP_FREEZE, &src->flags) && !test_bit(CGRP_FREEZE, &dst->flags) && !task->frozen) return; /* * Adjust counters of freezing and frozen tasks. 
* Note that if the task is frozen, but the destination cgroup is not * frozen, we bump both counters to keep them balanced. */ if (task->frozen) { cgroup_inc_frozen_cnt(dst); cgroup_dec_frozen_cnt(src); } cgroup_update_frozen(dst); cgroup_update_frozen(src); /* * Force the task to the desired state. */ cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); } void cgroup_freeze(struct cgroup *cgrp, bool freeze) { struct cgroup_subsys_state *css; struct cgroup *parent; struct cgroup *dsct; bool applied = false; bool old_e; lockdep_assert_held(&cgroup_mutex); /* * Nothing changed? Just exit. */ if (cgrp->freezer.freeze == freeze) return; cgrp->freezer.freeze = freeze; /* * Propagate the changes downwards through the cgroup tree. */ css_for_each_descendant_pre(css, &cgrp->self) { dsct = css->cgroup; if (cgroup_is_dead(dsct)) continue; /* * e_freeze is affected by the parent's e_freeze and dsct's own freeze. * If the old e_freeze equals the new e_freeze, there is no change, and * its children will not be affected either, so do nothing and skip the * subtree. */ old_e = dsct->freezer.e_freeze; parent = cgroup_parent(dsct); dsct->freezer.e_freeze = (dsct->freezer.freeze || parent->freezer.e_freeze); if (dsct->freezer.e_freeze == old_e) { css = css_rightmost_descendant(css); continue; } /* * Change the actual state: freeze or unfreeze. */ cgroup_do_freeze(dsct, freeze); applied = true; } /* * Even if the actual state hasn't changed, let's notify the user. * The state can be enforced by an ancestor cgroup: the cgroup * can already be in the desired state or it can be locked in the * opposite state, so that the transition will never happen. * In both cases it's better to notify the user that there is * nothing to wait for. */ if (!applied) { TRACE_CGROUP_PATH(notify_frozen, cgrp, test_bit(CGRP_FROZEN, &cgrp->flags)); cgroup_file_notify(&cgrp->events_file); } }
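/*
 * Editor's illustration -- not part of the original sources. A toy
 * userspace model of the ancestor walk in cgroup_propagate_frozen()
 * above: an ancestor flips to frozen only once nr_frozen_descendants
 * catches up with nr_descendants, and 'desc' grows because an ancestor
 * that just froze counts as one more frozen descendant one level up.
 * All names here are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_cgroup {
	struct toy_cgroup *parent;
	int nr_descendants;
	int nr_frozen_descendants;
	bool freeze;	/* wants to be frozen (CGRP_FREEZE) */
	bool frozen;	/* actually frozen (CGRP_FROZEN) */
};

static void toy_propagate_frozen(struct toy_cgroup *cgrp, bool frozen)
{
	int desc = 1;

	while ((cgrp = cgrp->parent)) {
		if (frozen) {
			cgrp->nr_frozen_descendants += desc;
			/* not frozen until every descendant is frozen */
			if (!cgrp->freeze ||
			    cgrp->nr_frozen_descendants != cgrp->nr_descendants)
				continue;
		} else {
			cgrp->nr_frozen_descendants -= desc;
		}
		if (cgrp->frozen != frozen) {
			cgrp->frozen = frozen;
			desc++;	/* this ancestor changed state, too */
		}
	}
}

int main(void)
{
	struct toy_cgroup root = { .freeze = true, .nr_descendants = 1 };
	struct toy_cgroup child = { .parent = &root };

	toy_propagate_frozen(&child, true);	/* child just froze */
	printf("root frozen: %d\n", root.frozen);	/* prints 1 */
	return 0;
}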
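/*
 * Editor's illustration -- not part of the original sources. The
 * user-visible side of cgroup_freeze() above is the cgroup v2
 * "cgroup.freeze" file: writing "1" requests the freeze, and
 * "frozen 1" appears in "cgroup.events" once the subtree is frozen.
 * A minimal userspace sketch; the cgroup path is hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/fs/cgroup/mygroup/cgroup.freeze", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1)	/* request the freeze */
		perror("write");
	close(fd);
	/* then poll "cgroup.events" until it reports "frozen 1" */
	return 0;
}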
// SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2011 Novell Inc. 
*/ #include <linux/fs.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/file.h> #include <linux/xattr.h> #include <linux/rbtree.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/ratelimit.h> #include "overlayfs.h" struct ovl_cache_entry { unsigned int len; unsigned int type; u64 real_ino; u64 ino; struct list_head l_node; struct rb_node node; struct ovl_cache_entry *next_maybe_whiteout; bool is_upper; bool is_whiteout; bool check_xwhiteout; char name[]; }; struct ovl_dir_cache { long refcount; u64 version; struct list_head entries; struct rb_root root; }; struct ovl_readdir_data { struct dir_context ctx; struct dentry *dentry; bool is_lowest; struct rb_root *root; struct list_head *list; struct list_head middle; struct ovl_cache_entry *first_maybe_whiteout; int count; int err; bool is_upper; bool d_type_supported; bool in_xwhiteouts_dir; }; struct ovl_dir_file { bool is_real; bool is_upper; struct ovl_dir_cache *cache; struct list_head *cursor; struct file *realfile; struct file *upperfile; }; static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) { return rb_entry(n, struct ovl_cache_entry, node); } static bool ovl_cache_entry_find_link(const char *name, int len, struct rb_node ***link, struct rb_node **parent) { bool found = false; struct rb_node **newp = *link; while (!found && *newp) { int cmp; struct ovl_cache_entry *tmp; *parent = *newp; tmp = ovl_cache_entry_from_node(*newp); cmp = strncmp(name, tmp->name, len); if (cmp > 0) newp = &tmp->node.rb_right; else if (cmp < 0 || len < tmp->len) newp = &tmp->node.rb_left; else found = true; } *link = newp; return found; } static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, const char *name, int len) { struct rb_node *node = root->rb_node; int cmp; while (node) { struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); cmp = strncmp(name, p->name, len); if (cmp > 0) node = p->node.rb_right; else if (cmp < 0 || len < p->len) node = p->node.rb_left; else return p; } return NULL; } static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, struct ovl_cache_entry *p) { /* Don't care if not doing ovl_iter() */ if (!rdd->dentry) return false; /* Always recalc d_ino when remapping lower inode numbers */ if (ovl_xino_bits(OVL_FS(rdd->dentry->d_sb))) return true; /* Always recalc d_ino for parent */ if (strcmp(p->name, "..") == 0) return true; /* If this is lower, then native d_ino will do */ if (!rdd->is_upper) return false; /* * Recalc d_ino for '.' and for all entries if dir is impure (contains * copied up entries) */ if ((p->name[0] == '.' 
&& p->len == 1) || ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry))) return true; return false; } static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); p = kmalloc(size, GFP_KERNEL); if (!p) return NULL; memcpy(p->name, name, len); p->name[len] = '\0'; p->len = len; p->type = d_type; p->real_ino = ino; p->ino = ino; /* Defer setting d_ino for upper entry to ovl_iterate() */ if (ovl_calc_d_ino(rdd, p)) p->ino = 0; p->is_upper = rdd->is_upper; p->is_whiteout = false; /* Defer check for overlay.whiteout to ovl_iterate() */ p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG; if (d_type == DT_CHR) { p->next_maybe_whiteout = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p; } return p; } static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { struct rb_node **newp = &rdd->root->rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; if (ovl_cache_entry_find_link(name, len, &newp, &parent)) return true; p = ovl_cache_entry_new(rdd, name, len, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; return false; } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, rdd->root); return true; } static bool ovl_fill_lowest(struct ovl_readdir_data *rdd, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(rdd->root, name, namelen); if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else list_add_tail(&p->l_node, &rdd->middle); } return rdd->err == 0; } void ovl_cache_free(struct list_head *list) { struct ovl_cache_entry *p; struct ovl_cache_entry *n; list_for_each_entry_safe(p, n, list, l_node) kfree(p); INIT_LIST_HEAD(list); } void ovl_dir_cache_free(struct inode *inode) { struct ovl_dir_cache *cache = ovl_dir_cache(inode); if (cache) { ovl_cache_free(&cache->entries); kfree(cache); } } static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode) { struct ovl_dir_cache *cache = od->cache; WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { if (ovl_dir_cache(inode) == cache) ovl_set_dir_cache(inode, NULL); ovl_cache_free(&cache->entries); kfree(cache); } } static bool ovl_fill_merge(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; if (!rdd->is_lowest) return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); else return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); } static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { int err; struct ovl_cache_entry *p; struct dentry *dentry, *dir = path->dentry; const struct cred *old_cred; old_cred = ovl_override_creds(rdd->dentry->d_sb); err = down_write_killable(&dir->d_inode->i_rwsem); if (!err) { while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); } } inode_unlock(dir->d_inode); } ovl_revert_creds(old_cred); return err; } static inline 
int ovl_dir_read(const struct path *realpath, struct ovl_readdir_data *rdd) { struct file *realfile; int err; realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE); if (IS_ERR(realfile)) return PTR_ERR(realfile); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { rdd->count = 0; rdd->err = 0; err = iterate_dir(realfile, &rdd->ctx); if (err >= 0) err = rdd->err; } while (!err && rdd->count); if (!err && rdd->first_maybe_whiteout && rdd->dentry) err = ovl_check_whiteouts(realpath, rdd); fput(realfile); return err; } static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; struct inode *inode = file_inode(file); bool is_real; if (cache && ovl_inode_version_get(inode) != cache->version) { ovl_cache_put(od, inode); od->cache = NULL; od->cursor = NULL; } is_real = ovl_dir_is_real(inode); if (od->is_real != is_real) { /* is_real can only become false when dir is copied up */ if (WARN_ON(is_real)) return; od->is_real = false; } } static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, struct rb_root *root) { int err; struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .dentry = dentry, .list = list, .root = root, .is_lowest = false, }; int idx, next; const struct ovl_layer *layer; for (idx = 0; idx != -1; idx = next) { next = ovl_path_next(idx, dentry, &realpath, &layer); rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && ovl_dentry_has_xwhiteouts(dentry); if (next != -1) { err = ovl_dir_read(&realpath, &rdd); if (err) break; } else { /* * Insert lowest layer entries before upper ones; this * allows offsets to be reasonably constant */ list_add(&rdd.middle, rdd.list); rdd.is_lowest = true; err = ovl_dir_read(&realpath, &rdd); list_del(&rdd.middle); } } return err; } static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) { struct list_head *p; loff_t off = 0; list_for_each(p, &od->cache->entries) { if (off >= pos) break; off++; } /* Cursor is safe since the cache is stable */ od->cursor = p; } static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; struct ovl_dir_cache *cache; struct inode *inode = d_inode(dentry); cache = ovl_dir_cache(inode); if (cache && ovl_inode_version_get(inode) == cache->version) { WARN_ON(!cache->refcount); cache->refcount++; return cache; } ovl_set_dir_cache(d_inode(dentry), NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) return ERR_PTR(-ENOMEM); cache->refcount = 1; INIT_LIST_HEAD(&cache->entries); cache->root = RB_ROOT; res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); return ERR_PTR(res); } cache->version = ovl_inode_version_get(inode); ovl_set_dir_cache(inode, cache); return cache; } /* Map inode number to lower fs unique range */ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, const char *name, int namelen, bool warn) { unsigned int xinoshift = 64 - xinobits; if (unlikely(ino >> xinoshift)) { if (warn) { pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", namelen, name, ino, xinobits); } return ino; } /* * The lowest xinobit is reserved for mapping the non-persistent inode * numbers range, but this range is only exposed via st_ino, not here. */ return ino | ((u64)fsid) << (xinoshift + 1); } /* * Set d_ino for upper entries if needed. 
Non-upper entries should always report * the uppermost real inode ino and should not call this function. * * When not all layers are on the same fs, report the real ino also for upper. * * When all layers are on the same fs, and upper has a reference to * copy up origin, call vfs_getattr() on the overlay entry to make * sure that d_ino will be consistent with st_ino from stat(2). * * Also checks the overlay.whiteout xattr by doing a full lookup, which will return * negative in this case. */ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino) { struct dentry *dir = path->dentry; struct ovl_fs *ofs = OVL_FS(dir->d_sb); struct dentry *this = NULL; enum ovl_path_type type; u64 ino = p->real_ino; int xinobits = ovl_xino_bits(ofs); int err = 0; if (!ovl_same_dev(ofs) && !p->check_xwhiteout) goto out; if (p->name[0] == '.') { if (p->len == 1) { this = dget(dir); goto get; } if (p->len == 2 && p->name[1] == '.') { /* we shall not be moved */ this = dget(dir->d_parent); goto get; } } /* This checks also for xwhiteouts */ this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; goto fail; } goto out; } get: if (!ovl_same_dev(ofs) || !update_ino) goto out; type = ovl_path_type(this); if (OVL_TYPE_ORIGIN(type)) { struct kstat stat; struct path statpath = *path; statpath.dentry = this; err = vfs_getattr(&statpath, &stat, STATX_INO, 0); if (err) goto fail; /* * Directory inode is always on overlay st_dev. * Non-dir with ovl_same_dev() could be on pseudo st_dev in case * of xino bits overflow. */ WARN_ON_ONCE(S_ISDIR(stat.mode) && dir->d_sb->s_dev != stat.dev); ino = stat.ino; } else if (xinobits && !OVL_TYPE_UPPER(type)) { ino = ovl_remap_lower_ino(ino, xinobits, ovl_layer_lower(this)->fsid, p->name, p->len, ovl_xino_warn(ofs)); } out: p->ino = ino; dput(this); return err; fail: pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } static bool ovl_fill_plain(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; return false; } list_add_tail(&p->l_node, rdd->list); return true; } static int ovl_dir_read_impure(const struct path *path, struct list_head *list, struct rb_root *root) { int err; struct path realpath; struct ovl_cache_entry *p, *n; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .list = list, .root = root, }; INIT_LIST_HEAD(list); *root = RB_ROOT; ovl_path_upper(path->dentry, &realpath); err = ovl_dir_read(&realpath, &rdd); if (err) return err; list_for_each_entry_safe(p, n, list, l_node) { if (strcmp(p->name, ".") != 0 && strcmp(p->name, "..") != 0) { err = ovl_cache_update(path, p, true); if (err) return err; } if (p->ino == p->real_ino) { list_del(&p->l_node); kfree(p); } else { struct rb_node **newp = &root->rb_node; struct rb_node *parent = NULL; if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len, &newp, &parent))) return -EIO; rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, root); } } return 0; } static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) { int res; struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); struct ovl_fs *ofs = 
OVL_FS(dentry->d_sb); struct ovl_dir_cache *cache; cache = ovl_dir_cache(inode); if (cache && ovl_inode_version_get(inode) == cache->version) return cache; /* Impure cache is not refcounted, free it here */ ovl_dir_cache_free(inode); ovl_set_dir_cache(inode, NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) return ERR_PTR(-ENOMEM); res = ovl_dir_read_impure(path, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); return ERR_PTR(res); } if (list_empty(&cache->entries)) { /* * A good opportunity to get rid of an unneeded "impure" flag. * Removing the "impure" xattr is best effort. */ if (!ovl_want_write(dentry)) { ovl_removexattr(ofs, ovl_dentry_upper(dentry), OVL_XATTR_IMPURE); ovl_drop_write(dentry); } ovl_clear_flag(OVL_IMPURE, inode); kfree(cache); return NULL; } cache->version = ovl_inode_version_get(inode); ovl_set_dir_cache(inode, cache); return cache; } struct ovl_readdir_translate { struct dir_context *orig_ctx; struct ovl_dir_cache *cache; struct dir_context ctx; u64 parent_ino; int fsid; int xinobits; bool xinowarn; }; static bool ovl_fill_real(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_translate *rdt = container_of(ctx, struct ovl_readdir_translate, ctx); struct dir_context *orig_ctx = rdt->orig_ctx; if (rdt->parent_ino && strcmp(name, "..") == 0) { ino = rdt->parent_ino; } else if (rdt->cache) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); if (p) ino = p->ino; } else if (rdt->xinobits) { ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, name, namelen, rdt->xinowarn); } return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); } static bool ovl_is_impure_dir(struct file *file) { struct ovl_dir_file *od = file->private_data; struct inode *dir = file_inode(file); /* * Only upper dir can be impure, but if we are in the middle of * iterating a lower real dir, dir could be copied up and marked * impure. We only want the impure cache if we started iterating * a real upper dir to begin with. 
*/ return od->is_upper && ovl_test_flag(OVL_IMPURE, dir); } static int ovl_iterate_real(struct file *file, struct dir_context *ctx) { int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; struct ovl_fs *ofs = OVL_FS(dir->d_sb); const struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .orig_ctx = ctx, .xinobits = ovl_xino_bits(ofs), .xinowarn = ovl_xino_warn(ofs), }; if (rdt.xinobits && lower_layer) rdt.fsid = lower_layer->fsid; if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { struct kstat stat; struct path statpath = file->f_path; statpath.dentry = dir->d_parent; err = vfs_getattr(&statpath, &stat, STATX_INO, 0); if (err) return err; WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); rdt.parent_ino = stat.ino; } if (ovl_is_impure_dir(file)) { rdt.cache = ovl_cache_get_impure(&file->f_path); if (IS_ERR(rdt.cache)) return PTR_ERR(rdt.cache); } err = iterate_dir(od->realfile, &rdt.ctx); ctx->pos = rdt.ctx.pos; return err; } static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_cache_entry *p; const struct cred *old_cred; int err; old_cred = ovl_override_creds(dentry->d_sb); if (!ctx->pos) ovl_dir_reset(file); if (od->is_real) { /* * If parent is merge, then need to adjust d_ino for '..', if * dir is impure then need to adjust d_ino for copied up * entries. */ if (ovl_xino_bits(ofs) || (ovl_same_fs(ofs) && (ovl_is_impure_dir(file) || OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { err = ovl_iterate_real(file, ctx); } else { err = iterate_dir(od->realfile, ctx); } goto out; } if (!od->cache) { struct ovl_dir_cache *cache; cache = ovl_cache_get(dentry); err = PTR_ERR(cache); if (IS_ERR(cache)) goto out; od->cache = cache; ovl_seek_cursor(od, ctx->pos); } while (od->cursor != &od->cache->entries) { p = list_entry(od->cursor, struct ovl_cache_entry, l_node); if (!p->is_whiteout) { if (!p->ino || p->check_xwhiteout) { err = ovl_cache_update(&file->f_path, p, !p->ino); if (err) goto out; } } /* ovl_cache_update() sets is_whiteout on stale entry */ if (!p->is_whiteout) { if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; } od->cursor = p->l_node.next; ctx->pos++; } err = 0; out: ovl_revert_creds(old_cred); return err; } static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) { loff_t res; struct ovl_dir_file *od = file->private_data; inode_lock(file_inode(file)); if (!file->f_pos) ovl_dir_reset(file); if (od->is_real) { res = vfs_llseek(od->realfile, offset, origin); file->f_pos = od->realfile->f_pos; } else { res = -EINVAL; switch (origin) { case SEEK_CUR: offset += file->f_pos; break; case SEEK_SET: break; default: goto out_unlock; } if (offset < 0) goto out_unlock; if (offset != file->f_pos) { file->f_pos = offset; if (od->cache) ovl_seek_cursor(od, offset); } res = offset; } out_unlock: inode_unlock(file_inode(file)); return res; } static struct file *ovl_dir_open_realfile(const struct file *file, const struct path *realpath) { struct file *res; const struct cred *old_cred; old_cred = ovl_override_creds(file_inode(file)->i_sb); res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); ovl_revert_creds(old_cred); return res; } /* * Like ovl_real_fdget(), returns upperfile if dir was copied up since open. * Unlike ovl_real_fdget(), this caches upperfile in file->private_data. 
* * TODO: use same abstract type for file->private_data of dir and file so * upperfile could also be cached for files as well. */ struct file *ovl_dir_real_file(const struct file *file, bool want_upper) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct file *old, *realfile = od->realfile; if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) return want_upper ? NULL : realfile; /* * Need to check if we started out being a lower dir, but got copied up */ if (!od->is_upper) { realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; ovl_path_upper(dentry, &upperpath); realfile = ovl_dir_open_realfile(file, &upperpath); if (IS_ERR(realfile)) return realfile; old = cmpxchg_release(&od->upperfile, NULL, realfile); if (old) { fput(realfile); realfile = old; } } } return realfile; } static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct file *realfile; int err; err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); if (err <= 0) return err; realfile = ovl_dir_real_file(file, true); err = PTR_ERR_OR_ZERO(realfile); /* Nothing to sync for lower */ if (!realfile || err) return err; return vfs_fsync_range(realfile, start, end, datasync); } static int ovl_dir_release(struct inode *inode, struct file *file) { struct ovl_dir_file *od = file->private_data; if (od->cache) { inode_lock(inode); ovl_cache_put(od, inode); inode_unlock(inode); } fput(od->realfile); if (od->upperfile) fput(od->upperfile); kfree(od); return 0; } static int ovl_dir_open(struct inode *inode, struct file *file) { struct path realpath; struct file *realfile; struct ovl_dir_file *od; enum ovl_path_type type; od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); if (!od) return -ENOMEM; type = ovl_path_real(file->f_path.dentry, &realpath); realfile = ovl_dir_open_realfile(file, &realpath); if (IS_ERR(realfile)) { kfree(od); return PTR_ERR(realfile); } od->realfile = realfile; od->is_real = ovl_dir_is_real(inode); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; return 0; } WRAP_DIR_ITER(ovl_iterate) // FIXME! const struct file_operations ovl_dir_operations = { .read = generic_read_dir, .open = ovl_dir_open, .iterate_shared = shared_ovl_iterate, .llseek = ovl_dir_llseek, .fsync = ovl_dir_fsync, .release = ovl_dir_release, }; int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; struct ovl_cache_entry *p, *n; struct rb_root root = RB_ROOT; const struct cred *old_cred; old_cred = ovl_override_creds(dentry->d_sb); err = ovl_dir_read_merged(dentry, list, &root); ovl_revert_creds(old_cred); if (err) return err; err = 0; list_for_each_entry_safe(p, n, list, l_node) { /* * Select whiteouts in upperdir, they should * be cleared when deleting this directory. 
*/ if (p->is_whiteout) { if (p->is_upper) continue; goto del_entry; } if (p->name[0] == '.') { if (p->len == 1) goto del_entry; if (p->len == 2 && p->name[1] == '.') goto del_entry; } err = -ENOTEMPTY; break; del_entry: list_del(&p->l_node); kfree(p); } return err; } void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; if (WARN_ON(!p->is_whiteout || !p->is_upper)) continue; dentry = ovl_lookup_upper(ofs, p->name, upper, p->len); if (IS_ERR(dentry)) { pr_err("lookup '%s/%.*s' failed (%i)\n", upper->d_name.name, p->len, p->name, (int) PTR_ERR(dentry)); continue; } if (dentry->d_inode) ovl_cleanup(ofs, upper->d_inode, dentry); dput(dentry); } inode_unlock(upper->d_inode); } static bool ovl_check_d_type(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); /* Even if d_type is not supported, DT_DIR is returned for . and .. */ if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen)) return true; if (d_type != DT_UNKNOWN) rdd->d_type_supported = true; return true; } /* * Returns 1 if d_type is supported, 0 if not supported/unknown. Negative * values if an error is encountered. */ int ovl_check_d_type_supported(const struct path *realpath) { int err; struct ovl_readdir_data rdd = { .ctx.actor = ovl_check_d_type, .d_type_supported = false, }; err = ovl_dir_read(realpath, &rdd); if (err) return err; return rdd.d_type_supported; } #define OVL_INCOMPATDIR_NAME "incompat" static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path, int level) { int err; struct inode *dir = path->dentry->d_inode; LIST_HEAD(list); struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .list = &list, }; bool incompat = false; /* * The "work/incompat" directory is treated specially - if it is not * empty, instead of printing a generic error and mounting read-only, * we will error about incompat features and fail the mount. * * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name * starts with '#'. 
*/ if (level == 2 && !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME)) incompat = true; err = ovl_dir_read(path, &rdd); if (err) goto out; inode_lock_nested(dir, I_MUTEX_PARENT); list_for_each_entry(p, &list, l_node) { struct dentry *dentry; if (p->name[0] == '.') { if (p->len == 1) continue; if (p->len == 2 && p->name[1] == '.') continue; } else if (incompat) { pr_err("overlay with incompat feature '%s' cannot be mounted\n", p->name); err = -EINVAL; break; } dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len); if (IS_ERR(dentry)) continue; if (dentry->d_inode) err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level); dput(dentry); if (err) break; } inode_unlock(dir); out: ovl_cache_free(&list); return err; } int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level) { int err; if (!d_is_dir(dentry) || level > 1) { return ovl_cleanup(ofs, dir, dentry); } err = ovl_do_rmdir(ofs, dir, dentry); if (err) { struct path path = { .mnt = mnt, .dentry = dentry }; inode_unlock(dir); err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1); inode_lock_nested(dir, I_MUTEX_PARENT); if (!err) err = ovl_cleanup(ofs, dir, dentry); } return err; } int ovl_indexdir_cleanup(struct ovl_fs *ofs) { int err; struct dentry *indexdir = ofs->workdir; struct dentry *index = NULL; struct inode *dir = indexdir->d_inode; struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; LIST_HEAD(list); struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .list = &list, }; err = ovl_dir_read(&path, &rdd); if (err) goto out; inode_lock_nested(dir, I_MUTEX_PARENT); list_for_each_entry(p, &list, l_node) { if (p->name[0] == '.') { if (p->len == 1) continue; if (p->len == 2 && p->name[1] == '.') continue; } index = ovl_lookup_upper(ofs, p->name, indexdir, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; break; } /* Cleanup leftover from index create/cleanup attempt */ if (index->d_name.name[0] == '#') { err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1); if (err) break; goto next; } err = ovl_verify_index(ofs, index); if (!err) { goto next; } else if (err == -ESTALE) { /* Cleanup stale index entries */ err = ovl_cleanup(ofs, dir, index); } else if (err != -ENOENT) { /* * Abort mount to avoid corrupting the index if * an incompatible index entry was found or on out * of memory. */ break; } else if (ofs->config.nfs_export) { /* * Whiteout orphan index to block future open by * handle after overlay nlink dropped to zero. */ err = ovl_cleanup_and_whiteout(ofs, dir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(ofs, dir, index); } if (err) break; next: dput(index); index = NULL; } dput(index); inode_unlock(dir); out: ovl_cache_free(&list); if (err) pr_err("failed index dir cleanup (%i)\n", err); return err; }
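/*
 * Editor's illustration -- not part of the original sources. The merged
 * readdir cache above orders entries with strncmp() bounded by the
 * probe length, plus a length tie-break, so that a name and its prefix
 * ("foobar" vs "foo") occupy distinct rbtree positions. A standalone
 * userspace sketch of that comparison (toy_cmp is hypothetical):
 */
#include <stdio.h>
#include <string.h>

/* <0: descend left, >0: descend right, 0: match */
static int toy_cmp(const char *name, int len, const char *entry, int entry_len)
{
	int cmp = strncmp(name, entry, len);

	if (cmp > 0)
		return 1;
	if (cmp < 0 || len < entry_len)
		return -1;
	return 0;
}

int main(void)
{
	printf("%d\n", toy_cmp("foo", 3, "foobar", 6));	/* -1 */
	printf("%d\n", toy_cmp("foobar", 6, "foo", 3));	/*  1 */
	printf("%d\n", toy_cmp("foo", 3, "foo", 3));	/*  0 */
	return 0;
}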
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. 
*/ #include "xfs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_fs.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" #include "xfs_inode.h" #ifdef DEBUG static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_DEFAULT, XFS_RANDOM_IFLUSH_1, XFS_RANDOM_IFLUSH_2, XFS_RANDOM_IFLUSH_3, XFS_RANDOM_IFLUSH_4, XFS_RANDOM_IFLUSH_5, XFS_RANDOM_IFLUSH_6, XFS_RANDOM_DA_READ_BUF, XFS_RANDOM_BTREE_CHECK_LBLOCK, XFS_RANDOM_BTREE_CHECK_SBLOCK, XFS_RANDOM_ALLOC_READ_AGF, XFS_RANDOM_IALLOC_READ_AGI, XFS_RANDOM_ITOBP_INOTOBP, XFS_RANDOM_IUNLINK, XFS_RANDOM_IUNLINK_REMOVE, XFS_RANDOM_DIR_INO_VALIDATE, XFS_RANDOM_BULKSTAT_READ_CHUNK, XFS_RANDOM_IODONE_IOERR, XFS_RANDOM_STRATREAD_IOERR, XFS_RANDOM_STRATCMPL_IOERR, XFS_RANDOM_DIOWRITE_IOERR, XFS_RANDOM_BMAPIFORMAT, XFS_RANDOM_FREE_EXTENT, XFS_RANDOM_RMAP_FINISH_ONE, XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE, XFS_RANDOM_REFCOUNT_FINISH_ONE, XFS_RANDOM_BMAP_FINISH_ONE, XFS_RANDOM_AG_RESV_CRITICAL, 0, /* XFS_RANDOM_DROP_WRITES has been removed */ XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, XFS_RANDOM_FORCE_SCRUB_REPAIR, XFS_RANDOM_FORCE_SUMMARY_RECALC, XFS_RANDOM_IUNLINK_FALLBACK, XFS_RANDOM_BUF_IOERROR, XFS_RANDOM_REDUCE_MAX_IEXTENTS, XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, XFS_RANDOM_AG_RESV_FAIL, XFS_RANDOM_LARP, XFS_RANDOM_DA_LEAF_SPLIT, XFS_RANDOM_ATTR_LEAF_TO_NODE, XFS_RANDOM_WB_DELAY_MS, XFS_RANDOM_WRITE_DELAY_MS, XFS_RANDOM_EXCHMAPS_FINISH_ONE, XFS_RANDOM_METAFILE_RESV_CRITICAL, }; struct xfs_errortag_attr { struct attribute attr; unsigned int tag; }; static inline struct xfs_errortag_attr * to_attr(struct attribute *attr) { return container_of(attr, struct xfs_errortag_attr, attr); } static inline struct xfs_mount * to_mp(struct kobject *kobject) { struct xfs_kobj *kobj = to_kobj(kobject); return container_of(kobj, struct xfs_mount, m_errortag_kobj); } STATIC ssize_t xfs_errortag_attr_store( struct kobject *kobject, struct attribute *attr, const char *buf, size_t count) { struct xfs_mount *mp = to_mp(kobject); struct xfs_errortag_attr *xfs_attr = to_attr(attr); int ret; unsigned int val; if (strcmp(buf, "default") == 0) { val = xfs_errortag_random_default[xfs_attr->tag]; } else { ret = kstrtouint(buf, 0, &val); if (ret) return ret; } ret = xfs_errortag_set(mp, xfs_attr->tag, val); if (ret) return ret; return count; } STATIC ssize_t xfs_errortag_attr_show( struct kobject *kobject, struct attribute *attr, char *buf) { struct xfs_mount *mp = to_mp(kobject); struct xfs_errortag_attr *xfs_attr = to_attr(attr); return snprintf(buf, PAGE_SIZE, "%u\n", xfs_errortag_get(mp, xfs_attr->tag)); } static const struct sysfs_ops xfs_errortag_sysfs_ops = { .show = xfs_errortag_attr_show, .store = xfs_errortag_attr_store, }; #define XFS_ERRORTAG_ATTR_RW(_name, _tag) \ static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \ .tag = (_tag), \ } #define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR); XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1); XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2); XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3); XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4); XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5); XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6); XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF); 
XFS_ERRORTAG_ATTR_RW(btree_chk_lblk,	XFS_ERRTAG_BTREE_CHECK_LBLOCK);
XFS_ERRORTAG_ATTR_RW(btree_chk_sblk,	XFS_ERRTAG_BTREE_CHECK_SBLOCK);
XFS_ERRORTAG_ATTR_RW(readagf,		XFS_ERRTAG_ALLOC_READ_AGF);
XFS_ERRORTAG_ATTR_RW(readagi,		XFS_ERRTAG_IALLOC_READ_AGI);
XFS_ERRORTAG_ATTR_RW(itobp,		XFS_ERRTAG_ITOBP_INOTOBP);
XFS_ERRORTAG_ATTR_RW(iunlink,		XFS_ERRTAG_IUNLINK);
XFS_ERRORTAG_ATTR_RW(iunlinkrm,		XFS_ERRTAG_IUNLINK_REMOVE);
XFS_ERRORTAG_ATTR_RW(dirinovalid,	XFS_ERRTAG_DIR_INO_VALIDATE);
XFS_ERRORTAG_ATTR_RW(bulkstat,		XFS_ERRTAG_BULKSTAT_READ_CHUNK);
XFS_ERRORTAG_ATTR_RW(logiodone,		XFS_ERRTAG_IODONE_IOERR);
XFS_ERRORTAG_ATTR_RW(stratread,		XFS_ERRTAG_STRATREAD_IOERR);
XFS_ERRORTAG_ATTR_RW(stratcmpl,		XFS_ERRTAG_STRATCMPL_IOERR);
XFS_ERRORTAG_ATTR_RW(diowrite,		XFS_ERRTAG_DIOWRITE_IOERR);
XFS_ERRORTAG_ATTR_RW(bmapifmt,		XFS_ERRTAG_BMAPIFORMAT);
XFS_ERRORTAG_ATTR_RW(free_extent,	XFS_ERRTAG_FREE_EXTENT);
XFS_ERRORTAG_ATTR_RW(rmap_finish_one,	XFS_ERRTAG_RMAP_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(refcount_continue_update,	XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE);
XFS_ERRORTAG_ATTR_RW(refcount_finish_one,	XFS_ERRTAG_REFCOUNT_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(bmap_finish_one,	XFS_ERRTAG_BMAP_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(ag_resv_critical,	XFS_ERRTAG_AG_RESV_CRITICAL);
XFS_ERRORTAG_ATTR_RW(log_bad_crc,	XFS_ERRTAG_LOG_BAD_CRC);
XFS_ERRORTAG_ATTR_RW(log_item_pin,	XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref,	XFS_ERRTAG_BUF_LRU_REF);
XFS_ERRORTAG_ATTR_RW(force_repair,	XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary,	XFS_ERRTAG_FORCE_SUMMARY_RECALC);
XFS_ERRORTAG_ATTR_RW(iunlink_fallback,	XFS_ERRTAG_IUNLINK_FALLBACK);
XFS_ERRORTAG_ATTR_RW(buf_ioerror,	XFS_ERRTAG_BUF_IOERROR);
XFS_ERRORTAG_ATTR_RW(reduce_max_iextents,	XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent,	XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
XFS_ERRORTAG_ATTR_RW(ag_resv_fail,	XFS_ERRTAG_AG_RESV_FAIL);
XFS_ERRORTAG_ATTR_RW(larp,		XFS_ERRTAG_LARP);
XFS_ERRORTAG_ATTR_RW(da_leaf_split,	XFS_ERRTAG_DA_LEAF_SPLIT);
XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node,	XFS_ERRTAG_ATTR_LEAF_TO_NODE);
XFS_ERRORTAG_ATTR_RW(wb_delay_ms,	XFS_ERRTAG_WB_DELAY_MS);
XFS_ERRORTAG_ATTR_RW(write_delay_ms,	XFS_ERRTAG_WRITE_DELAY_MS);
XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one,	XFS_ERRTAG_EXCHMAPS_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(metafile_resv_crit,	XFS_ERRTAG_METAFILE_RESV_CRITICAL);

static struct attribute *xfs_errortag_attrs[] = {
	XFS_ERRORTAG_ATTR_LIST(noerror),
	XFS_ERRORTAG_ATTR_LIST(iflush1),
	XFS_ERRORTAG_ATTR_LIST(iflush2),
	XFS_ERRORTAG_ATTR_LIST(iflush3),
	XFS_ERRORTAG_ATTR_LIST(iflush4),
	XFS_ERRORTAG_ATTR_LIST(iflush5),
	XFS_ERRORTAG_ATTR_LIST(iflush6),
	XFS_ERRORTAG_ATTR_LIST(dareadbuf),
	XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk),
	XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk),
	XFS_ERRORTAG_ATTR_LIST(readagf),
	XFS_ERRORTAG_ATTR_LIST(readagi),
	XFS_ERRORTAG_ATTR_LIST(itobp),
	XFS_ERRORTAG_ATTR_LIST(iunlink),
	XFS_ERRORTAG_ATTR_LIST(iunlinkrm),
	XFS_ERRORTAG_ATTR_LIST(dirinovalid),
	XFS_ERRORTAG_ATTR_LIST(bulkstat),
	XFS_ERRORTAG_ATTR_LIST(logiodone),
	XFS_ERRORTAG_ATTR_LIST(stratread),
	XFS_ERRORTAG_ATTR_LIST(stratcmpl),
	XFS_ERRORTAG_ATTR_LIST(diowrite),
	XFS_ERRORTAG_ATTR_LIST(bmapifmt),
	XFS_ERRORTAG_ATTR_LIST(free_extent),
	XFS_ERRORTAG_ATTR_LIST(rmap_finish_one),
	XFS_ERRORTAG_ATTR_LIST(refcount_continue_update),
	XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
	XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
	XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
	XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
	XFS_ERRORTAG_ATTR_LIST(log_item_pin),
	XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
	XFS_ERRORTAG_ATTR_LIST(force_repair),
	XFS_ERRORTAG_ATTR_LIST(bad_summary),
	XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
	XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
	XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
	XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
	XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
	XFS_ERRORTAG_ATTR_LIST(larp),
	XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
	XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
	XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
	XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
	XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one),
	XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit),
	NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);

static const struct kobj_type xfs_errortag_ktype = {
	.release = xfs_sysfs_release,
	.sysfs_ops = &xfs_errortag_sysfs_ops,
	.default_groups = xfs_errortag_groups,
};

int
xfs_errortag_init(
	struct xfs_mount	*mp)
{
	int			ret;

	mp->m_errortag = kzalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
				 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!mp->m_errortag)
		return -ENOMEM;

	ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
			     &mp->m_kobj, "errortag");
	if (ret)
		kfree(mp->m_errortag);
	return ret;
}

void
xfs_errortag_del(
	struct xfs_mount	*mp)
{
	xfs_sysfs_del(&mp->m_errortag_kobj);
	kfree(mp->m_errortag);
}

static bool
xfs_errortag_valid(
	unsigned int		error_tag)
{
	if (error_tag >= XFS_ERRTAG_MAX)
		return false;

	/* Error out removed injection types */
	if (error_tag == XFS_ERRTAG_DROP_WRITES)
		return false;
	return true;
}

bool
xfs_errortag_enabled(
	struct xfs_mount	*mp,
	unsigned int		tag)
{
	if (!mp->m_errortag)
		return false;
	if (!xfs_errortag_valid(tag))
		return false;

	return mp->m_errortag[tag] != 0;
}

bool
xfs_errortag_test(
	struct xfs_mount	*mp,
	const char		*expression,
	const char		*file,
	int			line,
	unsigned int		error_tag)
{
	unsigned int		randfactor;

	/*
	 * To be able to use error injection anywhere, we need to ensure the
	 * error injection mechanism is already initialized.
	 *
	 * Code paths like I/O completion can be called before the
	 * initialization is complete, but being able to inject errors in such
	 * places is still useful.
	 */
	if (!mp->m_errortag)
		return false;

	if (!xfs_errortag_valid(error_tag))
		return false;

	randfactor = mp->m_errortag[error_tag];
	if (!randfactor || get_random_u32_below(randfactor))
		return false;

	xfs_warn_ratelimited(mp,
"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
			expression, file, line, mp->m_super->s_id);
	return true;
}

int
xfs_errortag_get(
	struct xfs_mount	*mp,
	unsigned int		error_tag)
{
	if (!xfs_errortag_valid(error_tag))
		return -EINVAL;

	return mp->m_errortag[error_tag];
}

int
xfs_errortag_set(
	struct xfs_mount	*mp,
	unsigned int		error_tag,
	unsigned int		tag_value)
{
	if (!xfs_errortag_valid(error_tag))
		return -EINVAL;

	mp->m_errortag[error_tag] = tag_value;
	return 0;
}

int
xfs_errortag_add(
	struct xfs_mount	*mp,
	unsigned int		error_tag)
{
	BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX);

	if (!xfs_errortag_valid(error_tag))
		return -EINVAL;

	return xfs_errortag_set(mp, error_tag,
			xfs_errortag_random_default[error_tag]);
}

int
xfs_errortag_clearall(
	struct xfs_mount	*mp)
{
	memset(mp->m_errortag, 0, sizeof(unsigned int) * XFS_ERRTAG_MAX);
	return 0;
}
#endif /* DEBUG */

void
xfs_error_report(
	const char		*tag,
	int			level,
	struct xfs_mount	*mp,
	const char		*filename,
	int			linenum,
	xfs_failaddr_t		failaddr)
{
	if (level <= xfs_error_level) {
		xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
		"Internal error %s at line %d of file %s. Caller %pS",
			tag, linenum, filename, failaddr);
		xfs_stack_trace();
	}
}

void
xfs_corruption_error(
	const char		*tag,
	int			level,
	struct xfs_mount	*mp,
	const void		*buf,
	size_t			bufsize,
	const char		*filename,
	int			linenum,
	xfs_failaddr_t		failaddr)
{
	if (buf && level <= xfs_error_level)
		xfs_hex_dump(buf, bufsize);
	xfs_error_report(tag, level, mp, filename, linenum, failaddr);
	xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
}

/*
 * Complain about the kinds of metadata corruption that we can't detect from a
 * verifier, such as incorrect inter-block relationship data. Does not set
 * bp->b_error.
 *
 * Call xfs_buf_mark_corrupt, not this function.
 */
void
xfs_buf_corruption_error(
	struct xfs_buf		*bp,
	xfs_failaddr_t		fa)
{
	struct xfs_mount	*mp = bp->b_mount;

	xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
		  "Metadata corruption detected at %pS, %s block 0x%llx",
		  fa, bp->b_ops->name, xfs_buf_daddr(bp));

	xfs_alert(mp, "Unmount and run xfs_repair");

	if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
		xfs_stack_trace();
}

/*
 * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
 * values, and omit the stack trace unless the error level is tuned high.
 */
void
xfs_buf_verifier_error(
	struct xfs_buf		*bp,
	int			error,
	const char		*name,
	const void		*buf,
	size_t			bufsz,
	xfs_failaddr_t		failaddr)
{
	struct xfs_mount	*mp = bp->b_mount;
	xfs_failaddr_t		fa;
	int			sz;

	fa = failaddr ? failaddr : __return_address;
	__xfs_buf_ioerror(bp, error, fa);

	xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
		  "Metadata %s detected at %pS, %s block 0x%llx %s",
		  bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
		  fa, bp->b_ops->name, xfs_buf_daddr(bp), name);

	xfs_alert(mp, "Unmount and run xfs_repair");

	if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
		sz = min_t(size_t, XFS_CORRUPTION_DUMP_LEN, bufsz);
		xfs_alert(mp, "First %d bytes of corrupted metadata buffer:",
				sz);
		xfs_hex_dump(buf, sz);
	}

	if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
		xfs_stack_trace();
}

/*
 * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
 * values, and omit the stack trace unless the error level is tuned high.
 */
void
xfs_verifier_error(
	struct xfs_buf		*bp,
	int			error,
	xfs_failaddr_t		failaddr)
{
	return xfs_buf_verifier_error(bp, error, "", xfs_buf_offset(bp, 0),
			XFS_CORRUPTION_DUMP_LEN, failaddr);
}

/*
 * Warnings for inode corruption problems. Don't bother with the stack
 * trace unless the error level is turned up high.
 */
void
xfs_inode_verifier_error(
	struct xfs_inode	*ip,
	int			error,
	const char		*name,
	const void		*buf,
	size_t			bufsz,
	xfs_failaddr_t		failaddr)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_failaddr_t		fa;
	int			sz;

	fa = failaddr ? failaddr : __return_address;

	xfs_alert(mp, "Metadata %s detected at %pS, inode 0x%llx %s",
		  error == -EFSBADCRC ? "CRC error" : "corruption",
		  fa, ip->i_ino, name);

	xfs_alert(mp, "Unmount and run xfs_repair");

	if (buf && xfs_error_level >= XFS_ERRLEVEL_LOW) {
		sz = min_t(size_t, XFS_CORRUPTION_DUMP_LEN, bufsz);
		xfs_alert(mp, "First %d bytes of corrupted metadata buffer:",
				sz);
		xfs_hex_dump(buf, sz);
	}

	if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
		xfs_stack_trace();
}
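For context, a minimal sketch (not part of the file above) of how a call site typically consumes one of these injection points. The wrapper mirrors the XFS_TEST_ERROR() pattern from fs/xfs/xfs_error.h (real expression evaluated first, errortag rolled only if it is false); xfs_alloc_read_agf_example() is a hypothetical caller for illustration only.

/* Sketch of an errortag consumer; XFS_TEST_ERROR_SKETCH is an assumed
 * stand-in for the real XFS_TEST_ERROR macro, and the caller is made up. */
#define XFS_TEST_ERROR_SKETCH(expr, mp, tag) \
	((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag)))

static int xfs_alloc_read_agf_example(struct xfs_mount *mp)
{
	/* Fails roughly once per XFS_RANDOM_ALLOC_READ_AGF calls when armed */
	if (XFS_TEST_ERROR_SKETCH(false, mp, XFS_ERRTAG_ALLOC_READ_AGF))
		return -EIO;
	return 0;
}

From userspace, the knobs created by xfs_errortag_init() above appear under the per-mount sysfs directory (the kobject is named "errortag"); per xfs_errortag_attr_store(), writing a number sets the 1-in-N frequency and writing the string "default" re-arms the tag's default frequency.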
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2011  Intel Corporation. All rights reserved.
 * Copyright (C) 2014 Marvell International Ltd.
 */

#define pr_fmt(fmt) "llcp: %s: " fmt, __func__

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/nfc.h>

#include "nfc.h"
#include "llcp.h"

static u8 llcp_magic[3] = {0x46, 0x66, 0x6d};

static LIST_HEAD(llcp_devices);
/* Protects llcp_devices list */
static DEFINE_SPINLOCK(llcp_devices_lock);

static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb);

void nfc_llcp_sock_link(struct llcp_sock_list *l, struct sock *sk)
{
	write_lock(&l->lock);
	sk_add_node(sk, &l->head);
	write_unlock(&l->lock);
}

void nfc_llcp_sock_unlink(struct llcp_sock_list *l, struct sock *sk)
{
	write_lock(&l->lock);
	sk_del_node_init(sk);
	write_unlock(&l->lock);
}

void nfc_llcp_socket_remote_param_init(struct nfc_llcp_sock *sock)
{
	sock->remote_rw = LLCP_DEFAULT_RW;
	sock->remote_miu = LLCP_MAX_MIU + 1;
}

static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock)
{
	struct nfc_llcp_local *local = sock->local;
	struct sk_buff *s, *tmp;

	skb_queue_purge(&sock->tx_queue);
	skb_queue_purge(&sock->tx_pending_queue);

	if (local == NULL)
		return;

	/* Search for local pending SKBs that are related to this socket */
	skb_queue_walk_safe(&local->tx_queue, s, tmp) {
		if (s->sk != &sock->sk)
			continue;

		skb_unlink(s, &local->tx_queue);
		kfree_skb(s);
	}
}

static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device,
				    int err)
{
	struct sock *sk;
	struct hlist_node *tmp;
	struct nfc_llcp_sock *llcp_sock;

	skb_queue_purge(&local->tx_queue);

	write_lock(&local->sockets.lock);

	sk_for_each_safe(sk, tmp, &local->sockets.head) {
		llcp_sock = nfc_llcp_sock(sk);

		bh_lock_sock(sk);

		nfc_llcp_socket_purge(llcp_sock);

		if (sk->sk_state == LLCP_CONNECTED)
			nfc_put_device(llcp_sock->dev);

		if (sk->sk_state == LLCP_LISTEN) {
			struct nfc_llcp_sock *lsk, *n;
			struct sock *accept_sk;

			list_for_each_entry_safe(lsk, n,
						 &llcp_sock->accept_queue,
						 accept_queue) {
				accept_sk = &lsk->sk;
				bh_lock_sock(accept_sk);

				nfc_llcp_accept_unlink(accept_sk);

				if (err)
					accept_sk->sk_err = err;
				accept_sk->sk_state = LLCP_CLOSED;
				accept_sk->sk_state_change(sk);

				bh_unlock_sock(accept_sk);
			}
		}

		if (err)
			sk->sk_err = err;
		sk->sk_state = LLCP_CLOSED;
		sk->sk_state_change(sk);

		bh_unlock_sock(sk);

		sk_del_node_init(sk);
	}

	write_unlock(&local->sockets.lock);

	/* If we still have a device, we keep the RAW sockets alive */
	if (device == true)
		return;

	write_lock(&local->raw_sockets.lock);

	sk_for_each_safe(sk, tmp, &local->raw_sockets.head) {
		llcp_sock = nfc_llcp_sock(sk);

		bh_lock_sock(sk);

		nfc_llcp_socket_purge(llcp_sock);

		if (err)
			sk->sk_err = err;
		sk->sk_state = LLCP_CLOSED;
		sk->sk_state_change(sk);

		bh_unlock_sock(sk);

		sk_del_node_init(sk);
	}

	write_unlock(&local->raw_sockets.lock);
}

static struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local)
{
	/* Since using nfc_llcp_local may result in usage of nfc_dev, whenever
	 * we hold a reference to local, we also need to hold a reference to
	 * the device to avoid UAF.
	 */
	if (!nfc_get_device(local->dev->idx))
		return NULL;

	kref_get(&local->ref);

	return local;
}

static void local_cleanup(struct nfc_llcp_local *local)
{
	nfc_llcp_socket_release(local, false, ENXIO);
	del_timer_sync(&local->link_timer);
	skb_queue_purge(&local->tx_queue);
	cancel_work_sync(&local->tx_work);
	cancel_work_sync(&local->rx_work);
	cancel_work_sync(&local->timeout_work);
	kfree_skb(local->rx_pending);
	local->rx_pending = NULL;
	del_timer_sync(&local->sdreq_timer);
	cancel_work_sync(&local->sdreq_timeout_work);
	nfc_llcp_free_sdp_tlv_list(&local->pending_sdreqs);
}

static void local_release(struct kref *ref)
{
	struct nfc_llcp_local *local;

	local = container_of(ref, struct nfc_llcp_local, ref);

	local_cleanup(local);
	kfree(local);
}

int nfc_llcp_local_put(struct nfc_llcp_local *local)
{
	struct nfc_dev *dev;
	int ret;

	if (local == NULL)
		return 0;

	dev = local->dev;

	ret = kref_put(&local->ref, local_release);
	nfc_put_device(dev);

	return ret;
}

static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local,
					       u8 ssap, u8 dsap)
{
	struct sock *sk;
	struct nfc_llcp_sock *llcp_sock, *tmp_sock;

	pr_debug("ssap dsap %d %d\n", ssap, dsap);

	if (ssap == 0 && dsap == 0)
		return NULL;

	read_lock(&local->sockets.lock);

	llcp_sock = NULL;

	sk_for_each(sk, &local->sockets.head) {
		tmp_sock = nfc_llcp_sock(sk);

		if (tmp_sock->ssap == ssap && tmp_sock->dsap == dsap) {
			llcp_sock = tmp_sock;
			sock_hold(&llcp_sock->sk);
			break;
		}
	}

	read_unlock(&local->sockets.lock);

	return llcp_sock;
}

static void nfc_llcp_sock_put(struct nfc_llcp_sock *sock)
{
	sock_put(&sock->sk);
}

static void nfc_llcp_timeout_work(struct work_struct *work)
{
	struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local,
						    timeout_work);

	nfc_dep_link_down(local->dev);
}

static void nfc_llcp_symm_timer(struct timer_list *t)
{
	struct nfc_llcp_local *local = from_timer(local, t, link_timer);

	pr_err("SYMM timeout\n");

	schedule_work(&local->timeout_work);
}

static void nfc_llcp_sdreq_timeout_work(struct work_struct *work)
{
	unsigned long time;
	HLIST_HEAD(nl_sdres_list);
	struct hlist_node *n;
	struct nfc_llcp_sdp_tlv *sdp;
	struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local,
						    sdreq_timeout_work);

	mutex_lock(&local->sdreq_lock);

	time = jiffies - msecs_to_jiffies(3 * local->remote_lto);

	hlist_for_each_entry_safe(sdp, n, &local->pending_sdreqs, node) {
		if (time_after(sdp->time, time))
			continue;

		sdp->sap = LLCP_SDP_UNBOUND;
		hlist_del(&sdp->node);
		hlist_add_head(&sdp->node, &nl_sdres_list);
	}

	if (!hlist_empty(&local->pending_sdreqs))
		mod_timer(&local->sdreq_timer, jiffies +
			  msecs_to_jiffies(3 * local->remote_lto));

	mutex_unlock(&local->sdreq_lock);

	if (!hlist_empty(&nl_sdres_list))
		nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list);
}

static void nfc_llcp_sdreq_timer(struct timer_list *t)
{
	struct nfc_llcp_local *local = from_timer(local, t, sdreq_timer);

	schedule_work(&local->sdreq_timeout_work);
}

struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev)
{
	struct nfc_llcp_local *local;
	struct nfc_llcp_local *res = NULL;

	spin_lock(&llcp_devices_lock);
	list_for_each_entry(local, &llcp_devices, list)
		if (local->dev == dev) {
			res = nfc_llcp_local_get(local);
			break;
		}
	spin_unlock(&llcp_devices_lock);

	return res;
}

static struct nfc_llcp_local *nfc_llcp_remove_local(struct nfc_dev *dev)
{
	struct nfc_llcp_local *local, *tmp;

	spin_lock(&llcp_devices_lock);
	list_for_each_entry_safe(local, tmp, &llcp_devices, list)
		if (local->dev == dev) {
			list_del(&local->list);
			spin_unlock(&llcp_devices_lock);
			return local;
		}
	spin_unlock(&llcp_devices_lock);

	pr_warn("Shutting down device not found\n");

	return NULL;
}

static char *wks[] = {
	NULL,
	NULL, /* SDP */
	"urn:nfc:sn:ip",
	"urn:nfc:sn:obex",
	"urn:nfc:sn:snep",
};

static int nfc_llcp_wks_sap(const char *service_name, size_t service_name_len)
{
	int sap, num_wks;

	pr_debug("%s\n", service_name);

	if (service_name == NULL)
		return -EINVAL;

	num_wks = ARRAY_SIZE(wks);

	for (sap = 0; sap < num_wks; sap++) {
		if (wks[sap] == NULL)
			continue;

		if (strncmp(wks[sap], service_name, service_name_len) == 0)
			return sap;
	}

	return -EINVAL;
}

static struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local,
						   const u8 *sn, size_t sn_len,
						   bool needref)
{
	struct sock *sk;
	struct nfc_llcp_sock *llcp_sock, *tmp_sock;

	pr_debug("sn %zd %p\n", sn_len, sn);

	if (sn == NULL || sn_len == 0)
		return NULL;

	read_lock(&local->sockets.lock);

	llcp_sock = NULL;

	sk_for_each(sk, &local->sockets.head) {
		tmp_sock = nfc_llcp_sock(sk);

		pr_debug("llcp sock %p\n", tmp_sock);

		if (tmp_sock->sk.sk_type == SOCK_STREAM &&
		    tmp_sock->sk.sk_state != LLCP_LISTEN)
			continue;

		if (tmp_sock->sk.sk_type == SOCK_DGRAM &&
		    tmp_sock->sk.sk_state != LLCP_BOUND)
			continue;

		if (tmp_sock->service_name == NULL ||
		    tmp_sock->service_name_len == 0)
			continue;

		if (tmp_sock->service_name_len != sn_len)
			continue;

		if (memcmp(sn, tmp_sock->service_name, sn_len) == 0) {
			llcp_sock = tmp_sock;
			if (needref)
				sock_hold(&llcp_sock->sk);
			break;
		}
	}

	read_unlock(&local->sockets.lock);

	pr_debug("Found llcp sock %p\n", llcp_sock);

	return llcp_sock;
}

u8 nfc_llcp_get_sdp_ssap(struct nfc_llcp_local *local,
			 struct nfc_llcp_sock *sock)
{
	mutex_lock(&local->sdp_lock);

	if (sock->service_name != NULL && sock->service_name_len > 0) {
		int ssap = nfc_llcp_wks_sap(sock->service_name,
					    sock->service_name_len);

		if (ssap > 0) {
			pr_debug("WKS %d\n", ssap);

			/* This is a WKS, let's check if it's free */
			if (test_bit(ssap, &local->local_wks)) {
				mutex_unlock(&local->sdp_lock);
				return LLCP_SAP_MAX;
			}

			set_bit(ssap, &local->local_wks);
			mutex_unlock(&local->sdp_lock);

			return ssap;
		}

		/*
		 * Check if there already is a non WKS socket bound
		 * to this service name.
		 */
		if (nfc_llcp_sock_from_sn(local, sock->service_name,
					  sock->service_name_len,
					  false) != NULL) {
			mutex_unlock(&local->sdp_lock);
			return LLCP_SAP_MAX;
		}

		mutex_unlock(&local->sdp_lock);

		return LLCP_SDP_UNBOUND;

	} else if (sock->ssap != 0 && sock->ssap < LLCP_WKS_NUM_SAP) {
		if (!test_bit(sock->ssap, &local->local_wks)) {
			set_bit(sock->ssap, &local->local_wks);
			mutex_unlock(&local->sdp_lock);
			return sock->ssap;
		}
	}

	mutex_unlock(&local->sdp_lock);

	return LLCP_SAP_MAX;
}

u8 nfc_llcp_get_local_ssap(struct nfc_llcp_local *local)
{
	u8 local_ssap;

	mutex_lock(&local->sdp_lock);

	local_ssap = find_first_zero_bit(&local->local_sap, LLCP_LOCAL_NUM_SAP);
	if (local_ssap == LLCP_LOCAL_NUM_SAP) {
		mutex_unlock(&local->sdp_lock);
		return LLCP_SAP_MAX;
	}

	set_bit(local_ssap, &local->local_sap);

	mutex_unlock(&local->sdp_lock);

	return local_ssap + LLCP_LOCAL_SAP_OFFSET;
}

void nfc_llcp_put_ssap(struct nfc_llcp_local *local, u8 ssap)
{
	u8 local_ssap;
	unsigned long *sdp;

	if (ssap < LLCP_WKS_NUM_SAP) {
		local_ssap = ssap;
		sdp = &local->local_wks;
	} else if (ssap < LLCP_LOCAL_NUM_SAP) {
		atomic_t *client_cnt;

		local_ssap = ssap - LLCP_WKS_NUM_SAP;
		sdp = &local->local_sdp;
		client_cnt = &local->local_sdp_cnt[local_ssap];

		pr_debug("%d clients\n", atomic_read(client_cnt));

		mutex_lock(&local->sdp_lock);

		if (atomic_dec_and_test(client_cnt)) {
			struct nfc_llcp_sock *l_sock;

			pr_debug("No more clients for SAP %d\n", ssap);

			clear_bit(local_ssap, sdp);

			/* Find the listening sock and set it back to UNBOUND */
			l_sock = nfc_llcp_sock_get(local, ssap, LLCP_SAP_SDP);
			if (l_sock) {
				l_sock->ssap = LLCP_SDP_UNBOUND;
				nfc_llcp_sock_put(l_sock);
			}
		}

		mutex_unlock(&local->sdp_lock);

		return;
	} else if (ssap < LLCP_MAX_SAP) {
		local_ssap = ssap - LLCP_LOCAL_NUM_SAP;
		sdp = &local->local_sap;
	} else {
		return;
	}

	mutex_lock(&local->sdp_lock);

	clear_bit(local_ssap, sdp);

	mutex_unlock(&local->sdp_lock);
}

static u8 nfc_llcp_reserve_sdp_ssap(struct nfc_llcp_local *local)
{
	u8 ssap;

	mutex_lock(&local->sdp_lock);

	ssap = find_first_zero_bit(&local->local_sdp, LLCP_SDP_NUM_SAP);
	if (ssap == LLCP_SDP_NUM_SAP) {
		mutex_unlock(&local->sdp_lock);

		return LLCP_SAP_MAX;
	}

	pr_debug("SDP ssap %d\n", LLCP_WKS_NUM_SAP + ssap);

	set_bit(ssap, &local->local_sdp);

	mutex_unlock(&local->sdp_lock);

	return LLCP_WKS_NUM_SAP + ssap;
}

static int nfc_llcp_build_gb(struct nfc_llcp_local *local)
{
	u8 *gb_cur, version, version_length;
	u8 lto_length, wks_length, miux_length;
	const u8 *version_tlv = NULL, *lto_tlv = NULL,
		 *wks_tlv = NULL, *miux_tlv = NULL;
	__be16 wks = cpu_to_be16(local->local_wks);
	u8 gb_len = 0;
	int ret = 0;

	version = LLCP_VERSION_11;
	version_tlv = nfc_llcp_build_tlv(LLCP_TLV_VERSION, &version,
					 1, &version_length);
	if (!version_tlv) {
		ret = -ENOMEM;
		goto out;
	}
	gb_len += version_length;

	lto_tlv = nfc_llcp_build_tlv(LLCP_TLV_LTO, &local->lto, 1, &lto_length);
	if (!lto_tlv) {
		ret = -ENOMEM;
		goto out;
	}
	gb_len += lto_length;

	pr_debug("Local wks 0x%lx\n", local->local_wks);
	wks_tlv = nfc_llcp_build_tlv(LLCP_TLV_WKS, (u8 *)&wks, 2, &wks_length);
	if (!wks_tlv) {
		ret = -ENOMEM;
		goto out;
	}
	gb_len += wks_length;

	miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&local->miux, 0,
				      &miux_length);
	if (!miux_tlv) {
		ret = -ENOMEM;
		goto out;
	}
	gb_len += miux_length;

	gb_len += ARRAY_SIZE(llcp_magic);

	if (gb_len > NFC_MAX_GT_LEN) {
		ret = -EINVAL;
		goto out;
	}

	gb_cur = local->gb;

	memcpy(gb_cur, llcp_magic, ARRAY_SIZE(llcp_magic));
	gb_cur += ARRAY_SIZE(llcp_magic);

	memcpy(gb_cur, version_tlv, version_length);
	gb_cur += version_length;

	memcpy(gb_cur, lto_tlv, lto_length);
	gb_cur += lto_length;

	memcpy(gb_cur, wks_tlv, wks_length);
	gb_cur += wks_length;

	memcpy(gb_cur, miux_tlv, miux_length);
	gb_cur += miux_length;

	local->gb_len = gb_len;
out:
	kfree(version_tlv);
	kfree(lto_tlv);
	kfree(wks_tlv);
	kfree(miux_tlv);

	return ret;
}

u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len)
{
	struct nfc_llcp_local *local;

	local = nfc_llcp_find_local(dev);
	if (local == NULL) {
		*general_bytes_len = 0;
		return NULL;
	}

	nfc_llcp_build_gb(local);

	*general_bytes_len = local->gb_len;

	nfc_llcp_local_put(local);

	return local->gb;
}

int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len)
{
	struct nfc_llcp_local *local;
	int err;

	if (gb_len < 3 || gb_len > NFC_MAX_GT_LEN)
		return -EINVAL;

	local = nfc_llcp_find_local(dev);
	if (local == NULL) {
		pr_err("No LLCP device\n");
		return -ENODEV;
	}

	memset(local->remote_gb, 0, NFC_MAX_GT_LEN);
	memcpy(local->remote_gb, gb, gb_len);
	local->remote_gb_len = gb_len;

	if (memcmp(local->remote_gb, llcp_magic, 3)) {
		pr_err("MAC does not support LLCP\n");
		err = -EINVAL;
		goto out;
	}

	err = nfc_llcp_parse_gb_tlv(local,
				    &local->remote_gb[3],
				    local->remote_gb_len - 3);
out:
	nfc_llcp_local_put(local);
	return err;
}

static u8 nfc_llcp_dsap(const struct sk_buff *pdu)
{
	return (pdu->data[0] & 0xfc) >> 2;
}

static u8 nfc_llcp_ptype(const struct sk_buff *pdu)
{
	return ((pdu->data[0] & 0x03) << 2) | ((pdu->data[1] & 0xc0) >> 6);
}

static u8 nfc_llcp_ssap(const struct sk_buff *pdu)
{
	return pdu->data[1] & 0x3f;
}

static u8 nfc_llcp_ns(const struct sk_buff *pdu)
{
	return pdu->data[2] >> 4;
}

static u8 nfc_llcp_nr(const struct sk_buff *pdu)
{
	return pdu->data[2] & 0xf;
}

static void nfc_llcp_set_nrns(struct nfc_llcp_sock *sock, struct sk_buff *pdu)
{
	pdu->data[2] = (sock->send_n << 4) | (sock->recv_n);
	sock->send_n = (sock->send_n + 1) % 16;
	sock->recv_ack_n = (sock->recv_n - 1) % 16;
}

void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local,
			       struct sk_buff *skb, u8 direction)
{
	struct sk_buff *skb_copy = NULL, *nskb;
	struct sock *sk;
	u8 *data;

	read_lock(&local->raw_sockets.lock);

	sk_for_each(sk, &local->raw_sockets.head) {
		if (sk->sk_state != LLCP_BOUND)
			continue;

		if (skb_copy == NULL) {
			skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE,
						      GFP_ATOMIC, true);

			if (skb_copy == NULL)
				continue;

			data = skb_push(skb_copy, NFC_RAW_HEADER_SIZE);

			data[0] = local->dev ? local->dev->idx : 0xFF;
			data[1] = direction & 0x01;
			data[1] |= (RAW_PAYLOAD_LLCP << 1);
		}

		nskb = skb_clone(skb_copy, GFP_ATOMIC);
		if (!nskb)
			continue;

		if (sock_queue_rcv_skb(sk, nskb))
			kfree_skb(nskb);
	}

	read_unlock(&local->raw_sockets.lock);

	kfree_skb(skb_copy);
}

static void nfc_llcp_tx_work(struct work_struct *work)
{
	struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local,
						    tx_work);
	struct sk_buff *skb;
	struct sock *sk;
	struct nfc_llcp_sock *llcp_sock;

	skb = skb_dequeue(&local->tx_queue);
	if (skb != NULL) {
		sk = skb->sk;
		llcp_sock = nfc_llcp_sock(sk);

		if (llcp_sock == NULL && nfc_llcp_ptype(skb) == LLCP_PDU_I) {
			kfree_skb(skb);
			nfc_llcp_send_symm(local->dev);
		} else if (llcp_sock && !llcp_sock->remote_ready) {
			skb_queue_head(&local->tx_queue, skb);
			nfc_llcp_send_symm(local->dev);
		} else {
			struct sk_buff *copy_skb = NULL;
			u8 ptype = nfc_llcp_ptype(skb);
			int ret;

			pr_debug("Sending pending skb\n");
			print_hex_dump_debug("LLCP Tx: ", DUMP_PREFIX_OFFSET,
					     16, 1, skb->data, skb->len, true);

			if (ptype == LLCP_PDU_I)
				copy_skb = skb_copy(skb, GFP_ATOMIC);

			__net_timestamp(skb);

			nfc_llcp_send_to_raw_sock(local, skb,
						  NFC_DIRECTION_TX);

			ret = nfc_data_exchange(local->dev, local->target_idx,
						skb, nfc_llcp_recv, local);
			if (ret) {
				kfree_skb(copy_skb);
				goto out;
			}

			if (ptype == LLCP_PDU_I && copy_skb)
				skb_queue_tail(&llcp_sock->tx_pending_queue,
					       copy_skb);
		}
	} else {
		nfc_llcp_send_symm(local->dev);
	}

out:
	mod_timer(&local->link_timer,
		  jiffies + msecs_to_jiffies(2 * local->remote_lto));
}

static struct nfc_llcp_sock *nfc_llcp_connecting_sock_get(struct nfc_llcp_local *local,
							  u8 ssap)
{
	struct sock *sk;
	struct nfc_llcp_sock *llcp_sock;

	read_lock(&local->connecting_sockets.lock);

	sk_for_each(sk, &local->connecting_sockets.head) {
		llcp_sock = nfc_llcp_sock(sk);

		if (llcp_sock->ssap == ssap) {
			sock_hold(&llcp_sock->sk);
			goto out;
		}
	}

	llcp_sock = NULL;

out:
	read_unlock(&local->connecting_sockets.lock);

	return llcp_sock;
}

static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local,
						  const u8 *sn, size_t sn_len)
{
	return nfc_llcp_sock_from_sn(local, sn, sn_len, true);
}

static const u8 *nfc_llcp_connect_sn(const struct sk_buff *skb, size_t *sn_len)
{
	u8 type, length;
	const u8 *tlv = &skb->data[2];
	size_t tlv_array_len = skb->len - LLCP_HEADER_SIZE, offset = 0;

	while (offset < tlv_array_len) {
		type = tlv[0];
		length = tlv[1];

		pr_debug("type 0x%x length %d\n", type, length);

		if (type == LLCP_TLV_SN) {
			*sn_len = length;
			return &tlv[2];
		}

		offset += length + 2;
		tlv += length + 2;
	}

	return NULL;
}

static void nfc_llcp_recv_ui(struct nfc_llcp_local *local,
			     struct sk_buff *skb)
{
	struct nfc_llcp_sock *llcp_sock;
	struct nfc_llcp_ui_cb *ui_cb;
	u8 dsap, ssap;

	dsap = nfc_llcp_dsap(skb);
	ssap = nfc_llcp_ssap(skb);

	ui_cb = nfc_llcp_ui_skb_cb(skb);
	ui_cb->dsap = dsap;
	ui_cb->ssap = ssap;

	pr_debug("%d %d\n", dsap, ssap);

	/* We're looking for a bound socket, not a client one */
	llcp_sock = nfc_llcp_sock_get(local, dsap, LLCP_SAP_SDP);
	if (llcp_sock == NULL || llcp_sock->sk.sk_type != SOCK_DGRAM)
		return;

	/* There is no sequence with UI frames */
	skb_pull(skb, LLCP_HEADER_SIZE);
	if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) {
		/*
		 * UI frames will be freed from the socket layer, so we
		 * need to keep them alive until someone receives them.
		 */
		skb_get(skb);
	} else {
		pr_err("Receive queue is full\n");
	}

	nfc_llcp_sock_put(llcp_sock);
}

static void nfc_llcp_recv_connect(struct nfc_llcp_local *local,
				  const struct sk_buff *skb)
{
	struct sock *new_sk, *parent;
	struct nfc_llcp_sock *sock, *new_sock;
	u8 dsap, ssap, reason;

	dsap = nfc_llcp_dsap(skb);
	ssap = nfc_llcp_ssap(skb);

	pr_debug("%d %d\n", dsap, ssap);

	if (dsap != LLCP_SAP_SDP) {
		sock = nfc_llcp_sock_get(local, dsap, LLCP_SAP_SDP);
		if (sock == NULL || sock->sk.sk_state != LLCP_LISTEN) {
			reason = LLCP_DM_NOBOUND;
			goto fail;
		}
	} else {
		const u8 *sn;
		size_t sn_len;

		sn = nfc_llcp_connect_sn(skb, &sn_len);
		if (sn == NULL) {
			reason = LLCP_DM_NOBOUND;
			goto fail;
		}

		pr_debug("Service name length %zu\n", sn_len);

		sock = nfc_llcp_sock_get_sn(local, sn, sn_len);
		if (sock == NULL) {
			reason = LLCP_DM_NOBOUND;
			goto fail;
		}
	}

	lock_sock(&sock->sk);

	parent = &sock->sk;

	if (sk_acceptq_is_full(parent)) {
		reason = LLCP_DM_REJ;
		release_sock(&sock->sk);
		sock_put(&sock->sk);
		goto fail;
	}

	if (sock->ssap == LLCP_SDP_UNBOUND) {
		u8 ssap = nfc_llcp_reserve_sdp_ssap(local);

		pr_debug("First client, reserving %d\n", ssap);

		if (ssap == LLCP_SAP_MAX) {
			reason = LLCP_DM_REJ;
			release_sock(&sock->sk);
			sock_put(&sock->sk);
			goto fail;
		}

		sock->ssap = ssap;
	}

	new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0);
	if (new_sk == NULL) {
		reason = LLCP_DM_REJ;
		release_sock(&sock->sk);
		sock_put(&sock->sk);
		goto fail;
	}

	new_sock = nfc_llcp_sock(new_sk);

	new_sock->local = nfc_llcp_local_get(local);
	if (!new_sock->local) {
		reason = LLCP_DM_REJ;
		sock_put(&new_sock->sk);
		release_sock(&sock->sk);
		sock_put(&sock->sk);
		goto fail;
	}

	new_sock->dev = local->dev;
	new_sock->rw = sock->rw;
	new_sock->miux = sock->miux;
	new_sock->nfc_protocol = sock->nfc_protocol;
	new_sock->dsap = ssap;
	new_sock->target_idx = local->target_idx;
	new_sock->parent = parent;
	new_sock->ssap = sock->ssap;
	if (sock->ssap < LLCP_LOCAL_NUM_SAP && sock->ssap >= LLCP_WKS_NUM_SAP) {
		atomic_t *client_count;

		pr_debug("reserved_ssap %d for %p\n", sock->ssap, new_sock);

		client_count =
			&local->local_sdp_cnt[sock->ssap - LLCP_WKS_NUM_SAP];

		atomic_inc(client_count);
		new_sock->reserved_ssap = sock->ssap;
	}

	nfc_llcp_parse_connection_tlv(new_sock, &skb->data[LLCP_HEADER_SIZE],
				      skb->len - LLCP_HEADER_SIZE);

	pr_debug("new sock %p sk %p\n", new_sock, &new_sock->sk);

	nfc_llcp_sock_link(&local->sockets, new_sk);

	nfc_llcp_accept_enqueue(&sock->sk, new_sk);

	nfc_get_device(local->dev->idx);

	new_sk->sk_state = LLCP_CONNECTED;

	/* Wake the listening processes */
	parent->sk_data_ready(parent);

	/* Send CC */
	nfc_llcp_send_cc(new_sock);

	release_sock(&sock->sk);
	sock_put(&sock->sk);

	return;

fail:
	/* Send DM */
	nfc_llcp_send_dm(local, dsap, ssap, reason);
}

int nfc_llcp_queue_i_frames(struct nfc_llcp_sock *sock)
{
	int nr_frames = 0;
	struct nfc_llcp_local *local = sock->local;

	pr_debug("Remote ready %d tx queue len %d remote rw %d",
		 sock->remote_ready, skb_queue_len(&sock->tx_pending_queue),
		 sock->remote_rw);

	/* Try to queue some I frames for transmission */
	while (sock->remote_ready &&
	       skb_queue_len(&sock->tx_pending_queue) < sock->remote_rw) {
		struct sk_buff *pdu;

		pdu = skb_dequeue(&sock->tx_queue);
		if (pdu == NULL)
			break;

		/* Update N(S)/N(R) */
		nfc_llcp_set_nrns(sock, pdu);

		skb_queue_tail(&local->tx_queue, pdu);
		nr_frames++;
	}

	return nr_frames;
}

static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local,
			       struct sk_buff *skb)
{
	struct nfc_llcp_sock *llcp_sock;
	struct sock *sk;
	u8 dsap, ssap, ptype, ns, nr;

	ptype = nfc_llcp_ptype(skb);
	dsap = nfc_llcp_dsap(skb);
	ssap = nfc_llcp_ssap(skb);
	ns = nfc_llcp_ns(skb);
	nr = nfc_llcp_nr(skb);

	pr_debug("%d %d R %d S %d\n", dsap, ssap, nr, ns);

	llcp_sock = nfc_llcp_sock_get(local, dsap, ssap);
	if (llcp_sock == NULL) {
		nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN);
		return;
	}

	sk = &llcp_sock->sk;
	lock_sock(sk);
	if (sk->sk_state == LLCP_CLOSED) {
		release_sock(sk);
		nfc_llcp_sock_put(llcp_sock);
	}

	/* Pass the payload upstream */
	if (ptype == LLCP_PDU_I) {
		pr_debug("I frame, queueing on %p\n", &llcp_sock->sk);

		if (ns == llcp_sock->recv_n)
			llcp_sock->recv_n = (llcp_sock->recv_n + 1) % 16;
		else
			pr_err("Received out of sequence I PDU\n");

		skb_pull(skb, LLCP_HEADER_SIZE + LLCP_SEQUENCE_SIZE);
		if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) {
			/*
			 * I frames will be freed from the socket layer, so we
			 * need to keep them alive until someone receives them.
			 */
			skb_get(skb);
		} else {
			pr_err("Receive queue is full\n");
		}
	}

	/* Remove skbs from the pending queue */
	if (llcp_sock->send_ack_n != nr) {
		struct sk_buff *s, *tmp;
		u8 n;

		llcp_sock->send_ack_n = nr;

		/* Remove and free all skbs until ns == nr */
		skb_queue_walk_safe(&llcp_sock->tx_pending_queue, s, tmp) {
			n = nfc_llcp_ns(s);

			skb_unlink(s, &llcp_sock->tx_pending_queue);
			kfree_skb(s);

			if (n == nr)
				break;
		}

		/* Re-queue the remaining skbs for transmission */
		skb_queue_reverse_walk_safe(&llcp_sock->tx_pending_queue,
					    s, tmp) {
			skb_unlink(s, &llcp_sock->tx_pending_queue);
			skb_queue_head(&local->tx_queue, s);
		}
	}

	if (ptype == LLCP_PDU_RR)
		llcp_sock->remote_ready = true;
	else if (ptype == LLCP_PDU_RNR)
		llcp_sock->remote_ready = false;

	if (nfc_llcp_queue_i_frames(llcp_sock) == 0 && ptype == LLCP_PDU_I)
		nfc_llcp_send_rr(llcp_sock);

	release_sock(sk);
	nfc_llcp_sock_put(llcp_sock);
}

static void nfc_llcp_recv_disc(struct nfc_llcp_local *local,
			       const struct sk_buff *skb)
{
	struct nfc_llcp_sock *llcp_sock;
	struct sock *sk;
	u8 dsap, ssap;

	dsap = nfc_llcp_dsap(skb);
	ssap = nfc_llcp_ssap(skb);

	if ((dsap == 0) && (ssap == 0)) {
		pr_debug("Connection termination");
		nfc_dep_link_down(local->dev);
		return;
	}

	llcp_sock = nfc_llcp_sock_get(local, dsap, ssap);
	if (llcp_sock == NULL) {
		nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN);
		return;
	}

	sk = &llcp_sock->sk;
	lock_sock(sk);

	nfc_llcp_socket_purge(llcp_sock);

	if (sk->sk_state == LLCP_CLOSED) {
		release_sock(sk);
		nfc_llcp_sock_put(llcp_sock);
	}

	if (sk->sk_state == LLCP_CONNECTED) {
		nfc_put_device(local->dev);
		sk->sk_state = LLCP_CLOSED;
		sk->sk_state_change(sk);
	}

	nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_DISC);

	release_sock(sk);
	nfc_llcp_sock_put(llcp_sock);
}

static void nfc_llcp_recv_cc(struct nfc_llcp_local *local,
			     const struct sk_buff *skb)
{
	struct nfc_llcp_sock *llcp_sock;
	struct sock *sk;
	u8 dsap, ssap;

	dsap = nfc_llcp_dsap(skb);
	ssap = nfc_llcp_ssap(skb);

	llcp_sock = nfc_llcp_connecting_sock_get(local, dsap);
	if (llcp_sock == NULL) {
		pr_err("Invalid CC\n");
		nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN);
		return;
	}

	sk = &llcp_sock->sk;

	/* Unlink from connecting and link to the client array */
	nfc_llcp_sock_unlink(&local->connecting_sockets, sk);
	nfc_llcp_sock_link(&local->sockets, sk);
	llcp_sock->dsap = ssap;

	nfc_llcp_parse_connection_tlv(llcp_sock, &skb->data[LLCP_HEADER_SIZE],
				      skb->len - LLCP_HEADER_SIZE);

	sk->sk_state = LLCP_CONNECTED;
	sk->sk_state_change(sk);

	nfc_llcp_sock_put(llcp_sock);
}

static void nfc_llcp_recv_dm(struct nfc_llcp_local *local,
			     const struct sk_buff *skb)
{
	struct nfc_llcp_sock *llcp_sock;
	struct sock *sk;
	u8 dsap, ssap, reason;

	dsap = nfc_llcp_dsap(skb);
	ssap = nfc_llcp_ssap(skb);
	reason = skb->data[2];
	pr_debug("%d %d reason %d\n", ssap, dsap, reason);

	switch (reason) {
	case LLCP_DM_NOBOUND:
	case LLCP_DM_REJ:
		llcp_sock = nfc_llcp_connecting_sock_get(local, dsap);
		break;

	default:
		llcp_sock = nfc_llcp_sock_get(local, dsap, ssap);
		break;
	}

	if (llcp_sock == NULL) {
		pr_debug("Already closed\n");
		return;
	}

	sk = &llcp_sock->sk;

	sk->sk_err = ENXIO;
	sk->sk_state = LLCP_CLOSED;
	sk->sk_state_change(sk);

	nfc_llcp_sock_put(llcp_sock);
}

static void nfc_llcp_recv_snl(struct nfc_llcp_local *local,
			      const struct sk_buff *skb)
{
	struct nfc_llcp_sock *llcp_sock;
	u8 dsap, ssap, type, length, tid, sap;
	const u8 *tlv;
	u16 tlv_len, offset;
	const char *service_name;
	size_t service_name_len;
	struct nfc_llcp_sdp_tlv *sdp;
	HLIST_HEAD(llc_sdres_list);
	size_t sdres_tlvs_len;
	HLIST_HEAD(nl_sdres_list);

	dsap = nfc_llcp_dsap(skb);
	ssap = nfc_llcp_ssap(skb);

	pr_debug("%d %d\n", dsap, ssap);

	if (dsap != LLCP_SAP_SDP || ssap != LLCP_SAP_SDP) {
		pr_err("Wrong SNL SAP\n");
		return;
	}

	tlv = &skb->data[LLCP_HEADER_SIZE];
	tlv_len = skb->len - LLCP_HEADER_SIZE;
	offset = 0;
	sdres_tlvs_len = 0;

	while (offset < tlv_len) {
		type = tlv[0];
		length = tlv[1];

		switch (type) {
		case LLCP_TLV_SDREQ:
			tid = tlv[2];
			service_name = (char *)&tlv[3];
			service_name_len = length - 1;

			pr_debug("Looking for %.16s\n", service_name);

			if (service_name_len == strlen("urn:nfc:sn:sdp") &&
			    !strncmp(service_name, "urn:nfc:sn:sdp",
				     service_name_len)) {
				sap = 1;
				goto add_snl;
			}

			llcp_sock = nfc_llcp_sock_from_sn(local, service_name,
							  service_name_len,
							  true);
			if (!llcp_sock) {
				sap = 0;
				goto add_snl;
			}

			/*
			 * We found a socket but its ssap has not been reserved
			 * yet. We need to assign it for good and send a reply.
			 * The ssap will be freed when the socket is closed.
			 */
			if (llcp_sock->ssap == LLCP_SDP_UNBOUND) {
				atomic_t *client_count;

				sap = nfc_llcp_reserve_sdp_ssap(local);

				pr_debug("Reserving %d\n", sap);

				if (sap == LLCP_SAP_MAX) {
					sap = 0;
					nfc_llcp_sock_put(llcp_sock);
					goto add_snl;
				}

				client_count =
					&local->local_sdp_cnt[sap -
							      LLCP_WKS_NUM_SAP];

				atomic_inc(client_count);

				llcp_sock->ssap = sap;
				llcp_sock->reserved_ssap = sap;
			} else {
				sap = llcp_sock->ssap;
			}

			pr_debug("%p %d\n", llcp_sock, sap);

			nfc_llcp_sock_put(llcp_sock);
add_snl:
			sdp = nfc_llcp_build_sdres_tlv(tid, sap);
			if (sdp == NULL)
				goto exit;

			sdres_tlvs_len += sdp->tlv_len;
			hlist_add_head(&sdp->node, &llc_sdres_list);
			break;

		case LLCP_TLV_SDRES:
			mutex_lock(&local->sdreq_lock);

			pr_debug("LLCP_TLV_SDRES: searching tid %d\n", tlv[2]);

			hlist_for_each_entry(sdp, &local->pending_sdreqs, node) {
				if (sdp->tid != tlv[2])
					continue;

				sdp->sap = tlv[3];

				pr_debug("Found: uri=%s, sap=%d\n",
					 sdp->uri, sdp->sap);

				hlist_del(&sdp->node);

				hlist_add_head(&sdp->node, &nl_sdres_list);

				break;
			}

			mutex_unlock(&local->sdreq_lock);
			break;

		default:
			pr_err("Invalid SNL tlv value 0x%x\n", type);
			break;
		}

		offset += length + 2;
		tlv += length + 2;
	}

exit:
	if (!hlist_empty(&nl_sdres_list))
		nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list);

	if (!hlist_empty(&llc_sdres_list))
		nfc_llcp_send_snl_sdres(local, &llc_sdres_list, sdres_tlvs_len);
}

static void nfc_llcp_recv_agf(struct nfc_llcp_local *local, struct sk_buff *skb)
{
	u8 ptype;
	u16 pdu_len;
	struct sk_buff *new_skb;

	if (skb->len <= LLCP_HEADER_SIZE) {
		pr_err("Malformed AGF PDU\n");
		return;
	}

	skb_pull(skb, LLCP_HEADER_SIZE);

	while (skb->len > LLCP_AGF_PDU_HEADER_SIZE) {
		pdu_len = skb->data[0] << 8 | skb->data[1];

		skb_pull(skb, LLCP_AGF_PDU_HEADER_SIZE);

		if (pdu_len < LLCP_HEADER_SIZE || pdu_len > skb->len) {
			pr_err("Malformed AGF PDU\n");
			return;
		}

		ptype = nfc_llcp_ptype(skb);
		if (ptype == LLCP_PDU_SYMM || ptype == LLCP_PDU_AGF)
			goto next;

		new_skb = nfc_alloc_recv_skb(pdu_len, GFP_KERNEL);
		if (new_skb == NULL) {
			pr_err("Could not allocate PDU\n");
			return;
		}

		skb_put_data(new_skb, skb->data, pdu_len);

		nfc_llcp_rx_skb(local, new_skb);

		kfree_skb(new_skb);
next:
		skb_pull(skb, pdu_len);
	}
}

static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb)
{
	u8 dsap, ssap, ptype;

	ptype = nfc_llcp_ptype(skb);
	dsap = nfc_llcp_dsap(skb);
	ssap = nfc_llcp_ssap(skb);

	pr_debug("ptype 0x%x dsap 0x%x ssap 0x%x\n", ptype, dsap, ssap);

	if (ptype != LLCP_PDU_SYMM)
		print_hex_dump_debug("LLCP Rx: ", DUMP_PREFIX_OFFSET, 16, 1,
				     skb->data, skb->len, true);

	switch (ptype) {
	case LLCP_PDU_SYMM:
		pr_debug("SYMM\n");
		break;

	case LLCP_PDU_UI:
		pr_debug("UI\n");
		nfc_llcp_recv_ui(local, skb);
		break;

	case LLCP_PDU_CONNECT:
		pr_debug("CONNECT\n");
		nfc_llcp_recv_connect(local, skb);
		break;

	case LLCP_PDU_DISC:
		pr_debug("DISC\n");
		nfc_llcp_recv_disc(local, skb);
		break;

	case LLCP_PDU_CC:
		pr_debug("CC\n");
		nfc_llcp_recv_cc(local, skb);
		break;

	case LLCP_PDU_DM:
		pr_debug("DM\n");
		nfc_llcp_recv_dm(local, skb);
		break;

	case LLCP_PDU_SNL:
		pr_debug("SNL\n");
		nfc_llcp_recv_snl(local, skb);
		break;

	case LLCP_PDU_I:
	case LLCP_PDU_RR:
	case LLCP_PDU_RNR:
		pr_debug("I frame\n");
		nfc_llcp_recv_hdlc(local, skb);
		break;

	case LLCP_PDU_AGF:
		pr_debug("AGF frame\n");
		nfc_llcp_recv_agf(local, skb);
		break;
	}
}

static void nfc_llcp_rx_work(struct work_struct *work)
{
	struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local,
						    rx_work);
	struct sk_buff *skb;

	skb = local->rx_pending;
	if (skb == NULL) {
		pr_debug("No pending SKB\n");
		return;
	}

	__net_timestamp(skb);

	nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_RX);

	nfc_llcp_rx_skb(local, skb);

	schedule_work(&local->tx_work);
	kfree_skb(local->rx_pending);
	local->rx_pending = NULL;
}

static void __nfc_llcp_recv(struct nfc_llcp_local *local, struct sk_buff *skb)
{
	local->rx_pending = skb;
	del_timer(&local->link_timer);
	schedule_work(&local->rx_work);
}

void nfc_llcp_recv(void *data, struct sk_buff *skb, int err)
{
	struct nfc_llcp_local *local = (struct nfc_llcp_local *) data;

	if (err < 0) {
		pr_err("LLCP PDU receive err %d\n", err);
		return;
	}

	__nfc_llcp_recv(local, skb);
}

int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb)
{
	struct nfc_llcp_local *local;

	local = nfc_llcp_find_local(dev);
	if (local == NULL) {
		kfree_skb(skb);
		return -ENODEV;
	}

	__nfc_llcp_recv(local, skb);

	nfc_llcp_local_put(local);

	return 0;
}

void nfc_llcp_mac_is_down(struct nfc_dev *dev)
{
	struct nfc_llcp_local *local;

	local = nfc_llcp_find_local(dev);
	if (local == NULL)
		return;

	local->remote_miu = LLCP_DEFAULT_MIU;
	local->remote_lto = LLCP_DEFAULT_LTO;

	/* Close and purge all existing sockets */
	nfc_llcp_socket_release(local, true, 0);

	nfc_llcp_local_put(local);
}

void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx,
			u8 comm_mode, u8 rf_mode)
{
	struct nfc_llcp_local *local;

	pr_debug("rf mode %d\n", rf_mode);

	local = nfc_llcp_find_local(dev);
	if (local == NULL)
		return;

	local->target_idx = target_idx;
	local->comm_mode = comm_mode;
	local->rf_mode = rf_mode;

	if (rf_mode == NFC_RF_INITIATOR) {
		pr_debug("Queueing Tx work\n");

		schedule_work(&local->tx_work);
	} else {
		mod_timer(&local->link_timer,
			  jiffies + msecs_to_jiffies(local->remote_lto));
	}

	nfc_llcp_local_put(local);
}

int nfc_llcp_register_device(struct nfc_dev *ndev)
{
	struct nfc_llcp_local *local;

	local = kzalloc(sizeof(struct nfc_llcp_local), GFP_KERNEL);
	if (local == NULL)
		return -ENOMEM;
	/* As we are going to initialize the local's refcount, we need to get
	 * the nfc_dev to avoid UAF; otherwise there is no point in continuing.
	 * See nfc_llcp_local_get().
	 */
	local->dev = nfc_get_device(ndev->idx);
	if (!local->dev) {
		kfree(local);
		return -ENODEV;
	}

	INIT_LIST_HEAD(&local->list);
	kref_init(&local->ref);
	mutex_init(&local->sdp_lock);
	timer_setup(&local->link_timer, nfc_llcp_symm_timer, 0);

	skb_queue_head_init(&local->tx_queue);
	INIT_WORK(&local->tx_work, nfc_llcp_tx_work);

	local->rx_pending = NULL;
	INIT_WORK(&local->rx_work, nfc_llcp_rx_work);

	INIT_WORK(&local->timeout_work, nfc_llcp_timeout_work);

	rwlock_init(&local->sockets.lock);
	rwlock_init(&local->connecting_sockets.lock);
	rwlock_init(&local->raw_sockets.lock);

	local->lto = 150; /* 1500 ms */
	local->rw = LLCP_MAX_RW;
	local->miux = cpu_to_be16(LLCP_MAX_MIUX);
	local->local_wks = 0x1; /* LLC Link Management */

	nfc_llcp_build_gb(local);

	local->remote_miu = LLCP_DEFAULT_MIU;
	local->remote_lto = LLCP_DEFAULT_LTO;

	mutex_init(&local->sdreq_lock);
	INIT_HLIST_HEAD(&local->pending_sdreqs);
	timer_setup(&local->sdreq_timer, nfc_llcp_sdreq_timer, 0);
	INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work);

	spin_lock(&llcp_devices_lock);
	list_add(&local->list, &llcp_devices);
	spin_unlock(&llcp_devices_lock);

	return 0;
}

void nfc_llcp_unregister_device(struct nfc_dev *dev)
{
	struct nfc_llcp_local *local = nfc_llcp_remove_local(dev);

	if (local == NULL) {
		pr_debug("No such device\n");
		return;
	}

	local_cleanup(local);

	nfc_llcp_local_put(local);
}

int __init nfc_llcp_init(void)
{
	return nfc_llcp_sock_init();
}

void nfc_llcp_exit(void)
{
	nfc_llcp_sock_exit();
}
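As an aside, a minimal standalone sketch (not part of the file above) of the 2-byte LLCP header layout that nfc_llcp_dsap()/nfc_llcp_ptype()/nfc_llcp_ssap() decode: DSAP occupies the top 6 bits of byte 0, the 4-bit PTYPE straddles the byte boundary (2 bits in each byte), and SSAP is the low 6 bits of byte 1. llcp_pack_header_example() is a hypothetical helper for illustration, written to be the exact inverse of the decoders above.

/* Hypothetical encoder matching the field extraction in llcp_core.c */
static inline void llcp_pack_header_example(u8 *hdr, u8 dsap, u8 ptype, u8 ssap)
{
	hdr[0] = (u8)((dsap << 2) | ((ptype >> 2) & 0x03)); /* DSAP | PTYPE[3:2] */
	hdr[1] = (u8)(((ptype & 0x03) << 6) | (ssap & 0x3f)); /* PTYPE[1:0] | SSAP */
}

Round-tripping through nfc_llcp_dsap()/nfc_llcp_ptype()/nfc_llcp_ssap() recovers the original dsap, ptype, and ssap values, which is an easy way to sanity-check the bit layout.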
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/eventfd.h>
#include <linux/eventpoll.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io-wq.h"
#include "eventfd.h"

struct io_ev_fd {
	struct eventfd_ctx	*cq_ev_fd;
	unsigned int		eventfd_async;
	/* protected by ->completion_lock */
	unsigned		last_cq_tail;
	refcount_t		refs;
	atomic_t		ops;
	struct rcu_head		rcu;
};

enum {
	IO_EVENTFD_OP_SIGNAL_BIT,
};

static void io_eventfd_free(struct rcu_head *rcu)
{
	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);

	eventfd_ctx_put(ev_fd->cq_ev_fd);
	kfree(ev_fd);
}

static void io_eventfd_put(struct io_ev_fd *ev_fd)
{
	if (refcount_dec_and_test(&ev_fd->refs))
		call_rcu(&ev_fd->rcu, io_eventfd_free);
}

static void io_eventfd_do_signal(struct rcu_head *rcu)
{
	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);

	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
	io_eventfd_put(ev_fd);
}

static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
{
	if (put_ref)
		io_eventfd_put(ev_fd);
	rcu_read_unlock();
}

/*
 * Returns true if the caller should put the ev_fd reference, false if not.
 */
static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
{
	if (eventfd_signal_allowed()) {
		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
		return true;
	}
	if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
		call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
		return false;
	}
	return true;
}

/*
 * Trigger if eventfd_async isn't set, or if it's set and the caller is
 * an async worker. If ev_fd isn't valid, obviously return false.
 */
static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
{
	if (ev_fd)
		return !ev_fd->eventfd_async || io_wq_current_is_worker();
	return false;
}

/*
 * On success, returns with an ev_fd reference grabbed and the RCU read
 * lock held.
 */
static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		return NULL;

	rcu_read_lock();

	/*
	 * rcu_dereference ctx->io_ev_fd once and use it for both the check
	 * and the eventfd_signal
	 */
	ev_fd = rcu_dereference(ctx->io_ev_fd);

	/*
	 * Check again if ev_fd exists in case an io_eventfd_unregister call
	 * completed between the NULL check of ctx->io_ev_fd at the start of
	 * the function and rcu_read_lock.
	 */
	if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
		return ev_fd;

	rcu_read_unlock();
	return NULL;
}

void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = io_eventfd_grab(ctx);
	if (ev_fd)
		io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
}

void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = io_eventfd_grab(ctx);
	if (ev_fd) {
		bool skip, put_ref = true;

		/*
		 * Eventfd should only get triggered when at least one event
		 * has been posted. Some applications rely on the eventfd
		 * notification count only changing IFF a new CQE has been
		 * added to the CQ ring. There's no dependency on 1:1
		 * relationship between how many times this function is called
		 * (and hence the eventfd count) and number of CQEs posted to
		 * the CQ ring.
		 */
		spin_lock(&ctx->completion_lock);
		skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
		ev_fd->last_cq_tail = ctx->cached_cq_tail;
		spin_unlock(&ctx->completion_lock);

		if (!skip)
			put_ref = __io_eventfd_signal(ev_fd);

		io_eventfd_release(ev_fd, put_ref);
	}
}

int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);

		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ev_fd->last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	refcount_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	return 0;
}

int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		io_eventfd_put(ev_fd);
		return 0;
	}

	return -ENXIO;
}
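For reference, a minimal self-contained sketch (not part of the file above) of the lifetime pattern io_eventfd_grab() relies on: the object is published with rcu_assign_pointer(), looked up under rcu_read_lock(), and a reference is taken only when refcount_inc_not_zero() confirms the object has not already begun dying. Unlike io_eventfd_grab(), this sketch drops the RCU read lock before returning, since the acquired reference alone keeps the object alive. my_obj, my_slot, and my_obj_grab() are hypothetical names for illustration.

#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct my_obj {
	refcount_t refs;
	struct rcu_head rcu;
};

static struct my_obj __rcu *my_slot;

static struct my_obj *my_obj_grab(void)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = rcu_dereference(my_slot);
	/* Only take a reference if the object isn't already on its way out */
	if (obj && refcount_inc_not_zero(&obj->refs)) {
		rcu_read_unlock();
		return obj;	/* caller now owns a reference */
	}
	rcu_read_unlock();
	return NULL;
}

The unregister side mirrors io_eventfd_unregister(): clear the pointer with rcu_assign_pointer(my_slot, NULL) and drop the publish-time reference, letting the RCU grace period plus refcount guarantee no reader frees or uses the object prematurely.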
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	X.25 Packet Layer release 002
 *
 *	This is ALPHA test software. This code may break your machine,
 *	randomly fail to work with new releases, misbehave and/or generally
 *	screw up. It might even work.
 *
 *	This code REQUIRES 2.1.15 or higher
 *
 *	History
 *	X.25 001	Jonathan Naylor	Started coding.
 *	X.25 002	Jonathan Naylor	Centralised disconnect handling.
 *					New timer architecture.
 *	2000-03-11	Henner Eisen	MSG_EOR handling more POSIX compliant.
 *	2000-03-22	Daniela Squassoni Allowed disabling/enabling of
 *					  facilities negotiation and increased
 *					  the throughput upper limit.
 *	2000-08-27	Arnaldo C. Melo	s/suser/capable/ + micro cleanups
 *	2000-09-04	Henner Eisen	Set sock->state in x25_accept().
 *					Fixed x25_output() related skb leakage.
 *	2000-10-02	Henner Eisen	Made x25_kick() single threaded per socket.
 *	2000-10-27	Henner Eisen	MSG_DONTWAIT for fragment allocation.
 *	2000-11-14	Henner Eisen	Closing datalink from NETDEV_GOING_DOWN
 *	2002-10-06	Arnaldo C. Melo	Get rid of cli/sti, move proc stuff to
 *					x25_proc.c, using seq_file
 *	2005-04-02	Shaun Pereira	Selective sub address matching
 *					with call user data
 *	2005-04-15	Shaun Pereira	Fast select with no restriction on
 *					response
 */

#define pr_fmt(fmt) "X25: " fmt

#include <linux/module.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h>	/* For TIOCINQ/OUTQ */
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ctype.h>
#include <net/x25.h>
#include <net/compat.h>

int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20;
int sysctl_x25_call_request_timeout    = X25_DEFAULT_T21;
int sysctl_x25_reset_request_timeout   = X25_DEFAULT_T22;
int sysctl_x25_clear_request_timeout   = X25_DEFAULT_T23;
int sysctl_x25_ack_holdback_timeout    = X25_DEFAULT_T2;
int sysctl_x25_forward                 = 0;

HLIST_HEAD(x25_list);
DEFINE_RWLOCK(x25_list_lock);

static const struct proto_ops x25_proto_ops;

static const struct x25_address null_x25_address = {" "};

#ifdef CONFIG_COMPAT
struct compat_x25_subscrip_struct {
	char device[200 - sizeof(compat_ulong_t)];
	compat_ulong_t global_facil_mask;
	compat_uint_t extended;
};
#endif

int x25_parse_address_block(struct sk_buff *skb,
		struct x25_address *called_addr,
		struct x25_address *calling_addr)
{
	unsigned char len;
	int needed;
	int rc;

	if (!pskb_may_pull(skb, 1)) {
		/* packet has no address block */
		rc = 0;
		goto empty;
	}

	len = *skb->data;
	needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2;

	if (!pskb_may_pull(skb, needed)) {
		/* packet is too short to hold the addresses it claims
		   to hold */
		rc = -1;
		goto empty;
	}

	return x25_addr_ntoa(skb->data, called_addr, calling_addr);

empty:
	*called_addr->x25_addr = 0;
	*calling_addr->x25_addr = 0;

	return rc;
}

int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr,
		  struct x25_address *calling_addr)
{
	unsigned int called_len, calling_len;
	char *called, *calling;
	unsigned int i;

	called_len  = (*p >> 0) & 0x0F;
	calling_len = (*p >> 4) & 0x0F;

	called  = called_addr->x25_addr;
	calling = calling_addr->x25_addr;
	p++;

	for (i = 0; i < (called_len + calling_len); i++) {
		if (i < called_len) {
			if (i % 2 != 0) {
				*called++ = ((*p >> 0) & 0x0F) + '0';
				p++;
			} else {
				*called++ = ((*p >> 4) & 0x0F) + '0';
			}
		} else {
			if (i % 2 != 0) {
				*calling++ = ((*p >> 0) & 0x0F) + '0';
				p++;
			} else {
				*calling++ = ((*p >> 4) & 0x0F) + '0';
			}
		}
	}

	*called = *calling = '\0';

	return 1 + (called_len + calling_len + 1) / 2;
}

int x25_addr_aton(unsigned char *p, struct x25_address *called_addr,
		  struct x25_address *calling_addr)
{
	unsigned int called_len, calling_len;
	char *called, *calling;
	int i;

	called  = called_addr->x25_addr;
	calling = calling_addr->x25_addr;

	called_len  = strlen(called);
	calling_len = strlen(calling);

	*p++ = (calling_len << 4) | (called_len << 0);

	for (i = 0; i < (called_len + calling_len); i++) {
		if (i < called_len) {
			if (i % 2 != 0) {
				*p |= (*called++ - '0') << 0;
				p++;
			} else {
				*p = 0x00;
				*p |= (*called++ - '0') << 4;
			}
		} else {
			if (i % 2 != 0) {
				*p |= (*calling++ - '0') << 0;
				p++;
			} else {
				*p = 0x00;
				*p |= (*calling++ - '0') << 4;
			}
		}
	}

	return 1 + (called_len + calling_len + 1) / 2;
}
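/*
 * Editorial worked example (not part of the original source): with
 * called = "1234" and calling = "567", x25_addr_aton() above emits
 * five bytes:
 *
 *	0x34		length octet: calling_len (3) in the high nibble,
 *			called_len (4) in the low nibble
 *	0x12 0x34	called digits, packed two BCD digits per byte
 *	0x56 0x70	calling digits, odd trailing nibble padded with 0
 *
 * and returns 5, i.e. 1 + (4 + 3 + 1) / 2. x25_addr_ntoa() performs the
 * inverse transformation back into NUL-terminated digit strings.
 */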
/*
 *	Socket removal during an interrupt is now safe.
 */
static void x25_remove_socket(struct sock *sk)
{
	write_lock_bh(&x25_list_lock);
	sk_del_node_init(sk);
	write_unlock_bh(&x25_list_lock);
}

/*
 *	Handle device status changes.
 */
static int x25_device_event(struct notifier_block *this, unsigned long event,
			    void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct x25_neigh *nb;

	if (!net_eq(dev_net(dev), &init_net))
		return NOTIFY_DONE;

	if (dev->type == ARPHRD_X25) {
		switch (event) {
		case NETDEV_REGISTER:
		case NETDEV_POST_TYPE_CHANGE:
			x25_link_device_up(dev);
			break;
		case NETDEV_DOWN:
			nb = x25_get_neigh(dev);
			if (nb) {
				x25_link_terminated(nb);
				x25_neigh_put(nb);
			}
			x25_route_device_down(dev);
			break;
		case NETDEV_PRE_TYPE_CHANGE:
		case NETDEV_UNREGISTER:
			x25_link_device_down(dev);
			break;
		case NETDEV_CHANGE:
			if (!netif_carrier_ok(dev)) {
				nb = x25_get_neigh(dev);
				if (nb) {
					x25_link_terminated(nb);
					x25_neigh_put(nb);
				}
			}
			break;
		}
	}

	return NOTIFY_DONE;
}

/*
 *	Add a socket to the bound sockets list.
 */
static void x25_insert_socket(struct sock *sk)
{
	write_lock_bh(&x25_list_lock);
	sk_add_node(sk, &x25_list);
	write_unlock_bh(&x25_list_lock);
}

/*
 *	Find a socket that wants to accept the Call Request we just
 *	received. Check the full list for an address/cud match.
 *	If no cuds match return the next_best thing, an address match.
 *	Note: if a listening socket has cud set it must only get calls
 *	with matching cud.
 */
static struct sock *x25_find_listener(struct x25_address *addr,
					struct sk_buff *skb)
{
	struct sock *s;
	struct sock *next_best;

	read_lock_bh(&x25_list_lock);
	next_best = NULL;

	sk_for_each(s, &x25_list)
		if ((!strcmp(addr->x25_addr,
			x25_sk(s)->source_addr.x25_addr) ||
			!strcmp(x25_sk(s)->source_addr.x25_addr,
				null_x25_address.x25_addr)) &&
			s->sk_state == TCP_LISTEN) {
			/*
			 * Found a listening socket, now check the incoming
			 * call user data vs this sockets call user data
			 */
			if (x25_sk(s)->cudmatchlength > 0 &&
				skb->len >= x25_sk(s)->cudmatchlength) {
				if ((memcmp(x25_sk(s)->calluserdata.cuddata,
					skb->data,
					x25_sk(s)->cudmatchlength)) == 0) {
					sock_hold(s);
					goto found;
				}
			} else
				next_best = s;
		}
	if (next_best) {
		s = next_best;
		sock_hold(s);
		goto found;
	}
	s = NULL;
found:
	read_unlock_bh(&x25_list_lock);
	return s;
}

/*
 *	Find a connected X.25 socket given my LCI and neighbour.
 */
static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb)
{
	struct sock *s;

	sk_for_each(s, &x25_list)
		if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) {
			sock_hold(s);
			goto found;
		}
	s = NULL;
found:
	return s;
}

struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb)
{
	struct sock *s;

	read_lock_bh(&x25_list_lock);
	s = __x25_find_socket(lci, nb);
	read_unlock_bh(&x25_list_lock);
	return s;
}

/*
 *	Find a unique LCI for a given device.
 */
static unsigned int x25_new_lci(struct x25_neigh *nb)
{
	unsigned int lci = 1;
	struct sock *sk;

	while ((sk = x25_find_socket(lci, nb)) != NULL) {
		sock_put(sk);
		if (++lci == 4096) {
			lci = 0;
			break;
		}
		cond_resched();
	}

	return lci;
}

/*
 *	Deferred destroy.
 */
static void __x25_destroy_socket(struct sock *);

/*
 *	handler for deferred kills.
 */
static void x25_destroy_timer(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	x25_destroy_socket_from_timer(sk);
}
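/*
 * Editorial note (not part of the original source): x25_new_lci() probes
 * upward from LCI 1 using x25_find_socket() until it finds a channel no
 * socket on this neighbour is using. LCI 0 doubles as the "nothing free"
 * return value, and the probe stops at 4096, the first value that no
 * longer fits in the 12-bit logical channel field.
 */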
/*
 *	This is called from user mode and the timers. Thus it protects itself
 *	against interrupting users but doesn't worry about being called during
 *	work. Once it is removed from the queue no interrupt or bottom half
 *	will touch it and we are (fairly 8-) ) safe.
 *	Not static as it's used by the timer
 */
static void __x25_destroy_socket(struct sock *sk)
{
	struct sk_buff *skb;

	x25_stop_heartbeat(sk);
	x25_stop_timer(sk);

	x25_remove_socket(sk);
	x25_clear_queues(sk);		/* Flush the queues */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (skb->sk != sk) {		/* A pending connection */
			/*
			 * Queue the unaccepted socket for death
			 */
			skb->sk->sk_state = TCP_LISTEN;
			sock_set_flag(skb->sk, SOCK_DEAD);
			x25_start_heartbeat(skb->sk);
			x25_sk(skb->sk)->state = X25_STATE_0;
		}

		kfree_skb(skb);
	}

	if (sk_has_allocations(sk)) {
		/* Defer: outstanding buffers */
		sk->sk_timer.expires  = jiffies + 10 * HZ;
		sk->sk_timer.function = x25_destroy_timer;
		add_timer(&sk->sk_timer);
	} else {
		/* drop last reference so sock_put will free */
		__sock_put(sk);
	}
}

void x25_destroy_socket_from_timer(struct sock *sk)
{
	sock_hold(sk);
	bh_lock_sock(sk);
	__x25_destroy_socket(sk);
	bh_unlock_sock(sk);
	sock_put(sk);
}

/*
 *	Handling for system calls applied via the various interfaces to a
 *	X.25 socket object.
 */
static int x25_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	int opt;
	struct sock *sk = sock->sk;
	int rc = -ENOPROTOOPT;

	if (level != SOL_X25 || optname != X25_QBITINCL)
		goto out;

	rc = -EINVAL;
	if (optlen < sizeof(int))
		goto out;

	rc = -EFAULT;
	if (copy_from_sockptr(&opt, optval, sizeof(int)))
		goto out;

	if (opt)
		set_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags);
	else
		clear_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags);
	rc = 0;
out:
	return rc;
}

static int x25_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	int val, len, rc = -ENOPROTOOPT;

	if (level != SOL_X25 || optname != X25_QBITINCL)
		goto out;

	rc = -EFAULT;
	if (get_user(len, optlen))
		goto out;

	rc = -EINVAL;
	if (len < 0)
		goto out;

	len = min_t(unsigned int, len, sizeof(int));

	rc = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags);
	rc = copy_to_user(optval, &val, len) ? -EFAULT : 0;
out:
	return rc;
}
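/*
 * Editorial usage sketch (hypothetical, not part of the kernel source):
 * X25_QBITINCL is the only X.25-level socket option handled above. When
 * enabled, the first byte of every record exchanged with userspace
 * carries the logical value of the Q bit:
 */
#if 0	/* example only */
	int s = socket(AF_X25, SOCK_SEQPACKET, 0);
	int on = 1;

	if (setsockopt(s, SOL_X25, X25_QBITINCL, &on, sizeof(on)) < 0)
		perror("setsockopt");
#endif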
static int x25_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	int rc = -EOPNOTSUPP;

	lock_sock(sk);
	if (sock->state != SS_UNCONNECTED) {
		rc = -EINVAL;
		release_sock(sk);
		return rc;
	}

	if (sk->sk_state != TCP_LISTEN) {
		memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN);
		sk->sk_max_ack_backlog = backlog;
		sk->sk_state           = TCP_LISTEN;
		rc = 0;
	}
	release_sock(sk);

	return rc;
}

static struct proto x25_proto = {
	.name	  = "X25",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct x25_sock),
};

static struct sock *x25_alloc_socket(struct net *net, int kern)
{
	struct x25_sock *x25;
	struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern);

	if (!sk)
		goto out;

	sock_init_data(NULL, sk);

	x25 = x25_sk(sk);
	skb_queue_head_init(&x25->ack_queue);
	skb_queue_head_init(&x25->fragment_queue);
	skb_queue_head_init(&x25->interrupt_in_queue);
	skb_queue_head_init(&x25->interrupt_out_queue);
out:
	return sk;
}

static int x25_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct x25_sock *x25;
	int rc = -EAFNOSUPPORT;

	if (!net_eq(net, &init_net))
		goto out;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_SEQPACKET)
		goto out;

	rc = -EINVAL;
	if (protocol)
		goto out;

	rc = -ENOMEM;
	if ((sk = x25_alloc_socket(net, kern)) == NULL)
		goto out;

	x25 = x25_sk(sk);

	sock_init_data(sock, sk);

	x25_init_timers(sk);

	sock->ops          = &x25_proto_ops;
	sk->sk_protocol    = protocol;
	sk->sk_backlog_rcv = x25_backlog_rcv;

	x25->t21   = sysctl_x25_call_request_timeout;
	x25->t22   = sysctl_x25_reset_request_timeout;
	x25->t23   = sysctl_x25_clear_request_timeout;
	x25->t2    = sysctl_x25_ack_holdback_timeout;
	x25->state = X25_STATE_0;
	x25->cudmatchlength = 0;
	set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags);	/* normally no cud */
							/* on call accept */

	x25->facilities.winsize_in  = X25_DEFAULT_WINDOW_SIZE;
	x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
	x25->facilities.pacsize_in  = X25_DEFAULT_PACKET_SIZE;
	x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE;
	x25->facilities.throughput  = 0;	/* by default don't negotiate
						   throughput */
	x25->facilities.reverse     = X25_DEFAULT_REVERSE;
	x25->dte_facilities.calling_len = 0;
	x25->dte_facilities.called_len  = 0;
	memset(x25->dte_facilities.called_ae, '\0',
	       sizeof(x25->dte_facilities.called_ae));
	memset(x25->dte_facilities.calling_ae, '\0',
	       sizeof(x25->dte_facilities.calling_ae));

	rc = 0;
out:
	return rc;
}

static struct sock *x25_make_new(struct sock *osk)
{
	struct sock *sk = NULL;
	struct x25_sock *x25, *ox25;

	if (osk->sk_type != SOCK_SEQPACKET)
		goto out;

	if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL)
		goto out;

	x25 = x25_sk(sk);

	sk->sk_type        = osk->sk_type;
	sk->sk_priority    = READ_ONCE(osk->sk_priority);
	sk->sk_protocol    = osk->sk_protocol;
	sk->sk_rcvbuf      = osk->sk_rcvbuf;
	sk->sk_sndbuf      = osk->sk_sndbuf;
	sk->sk_state       = TCP_ESTABLISHED;
	sk->sk_backlog_rcv = osk->sk_backlog_rcv;
	sock_copy_flags(sk, osk);

	ox25 = x25_sk(osk);
	x25->t21            = ox25->t21;
	x25->t22            = ox25->t22;
	x25->t23            = ox25->t23;
	x25->t2             = ox25->t2;
	x25->flags          = ox25->flags;
	x25->facilities     = ox25->facilities;
	x25->dte_facilities = ox25->dte_facilities;
	x25->cudmatchlength = ox25->cudmatchlength;

	clear_bit(X25_INTERRUPT_FLAG, &x25->flags);
	x25_init_timers(sk);
out:
	return sk;
}
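/*
 * Editorial note (not part of the original source): x25_make_new() is the
 * accept-side constructor. The child socket inherits the listener's
 * timers (T2, T21, T22, T23), facilities, DTE facilities and CUD match
 * length, while per-call state such as X25_INTERRUPT_FLAG is cleared;
 * x25_create(), by contrast, installs the sysctl defaults.
 */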
static int x25_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct x25_sock *x25;

	if (!sk)
		return 0;

	x25 = x25_sk(sk);

	sock_hold(sk);
	lock_sock(sk);
	switch (x25->state) {
	case X25_STATE_0:
	case X25_STATE_2:
		x25_disconnect(sk, 0, 0, 0);
		__x25_destroy_socket(sk);
		goto out;

	case X25_STATE_1:
	case X25_STATE_3:
	case X25_STATE_4:
		x25_clear_queues(sk);
		x25_write_internal(sk, X25_CLEAR_REQUEST);
		x25_start_t23timer(sk);
		x25->state = X25_STATE_2;
		sk->sk_state	 = TCP_CLOSE;
		sk->sk_shutdown	|= SEND_SHUTDOWN;
		sk->sk_state_change(sk);
		sock_set_flag(sk, SOCK_DEAD);
		sock_set_flag(sk, SOCK_DESTROY);
		break;

	case X25_STATE_5:
		x25_write_internal(sk, X25_CLEAR_REQUEST);
		x25_disconnect(sk, 0, 0, 0);
		__x25_destroy_socket(sk);
		goto out;
	}

	sock_orphan(sk);
out:
	release_sock(sk);
	sock_put(sk);
	return 0;
}

static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr;
	int len, i, rc = 0;

	if (addr_len != sizeof(struct sockaddr_x25) ||
	    addr->sx25_family != AF_X25 ||
	    strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) {
		rc = -EINVAL;
		goto out;
	}

	/* check for the null_x25_address */
	if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) {
		len = strlen(addr->sx25_addr.x25_addr);
		for (i = 0; i < len; i++) {
			if (!isdigit(addr->sx25_addr.x25_addr[i])) {
				rc = -EINVAL;
				goto out;
			}
		}
	}

	lock_sock(sk);
	if (sock_flag(sk, SOCK_ZAPPED)) {
		x25_sk(sk)->source_addr = addr->sx25_addr;
		x25_insert_socket(sk);
		sock_reset_flag(sk, SOCK_ZAPPED);
	} else {
		rc = -EINVAL;
	}
	release_sock(sk);
	net_dbg_ratelimited("x25_bind: socket is bound\n");
out:
	return rc;
}

static int x25_wait_for_connection_establishment(struct sock *sk)
{
	DECLARE_WAITQUEUE(wait, current);
	int rc;

	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	for (;;) {
		__set_current_state(TASK_INTERRUPTIBLE);
		rc = -ERESTARTSYS;
		if (signal_pending(current))
			break;
		rc = sock_error(sk);
		if (rc) {
			sk->sk_socket->state = SS_UNCONNECTED;
			break;
		}
		rc = -ENOTCONN;
		if (sk->sk_state == TCP_CLOSE) {
			sk->sk_socket->state = SS_UNCONNECTED;
			break;
		}
		rc = 0;
		if (sk->sk_state != TCP_ESTABLISHED) {
			release_sock(sk);
			schedule();
			lock_sock(sk);
		} else
			break;
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
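/*
 * Editorial usage sketch (hypothetical, not part of the kernel source):
 * binding requires an all-digit X.121 address (or the null address) and
 * must precede connect(), since x25_connect() below refuses zapped
 * sockets ("Must bind first - autobinding does not work"):
 */
#if 0	/* example only */
	struct sockaddr_x25 local = { .sx25_family = AF_X25 };

	strcpy(local.sx25_addr.x25_addr, "2089");	/* X.121 address */
	if (bind(s, (struct sockaddr *)&local, sizeof(local)) < 0)
		perror("bind");
#endif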
static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
		       int addr_len, int flags)
{
	struct sock *sk = sock->sk;
	struct x25_sock *x25 = x25_sk(sk);
	struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr;
	struct x25_route *rt;
	int rc = 0;

	lock_sock(sk);
	if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
		sock->state = SS_CONNECTED;
		goto out; /* Connect completed during a ERESTARTSYS event */
	}

	rc = -ECONNREFUSED;
	if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) {
		sock->state = SS_UNCONNECTED;
		goto out;
	}

	rc = -EISCONN;	/* No reconnect on a seqpacket socket */
	if (sk->sk_state == TCP_ESTABLISHED)
		goto out;

	rc = -EALREADY;	/* Do nothing if call is already in progress */
	if (sk->sk_state == TCP_SYN_SENT)
		goto out;

	sk->sk_state = TCP_CLOSE;
	sock->state  = SS_UNCONNECTED;

	rc = -EINVAL;
	if (addr_len != sizeof(struct sockaddr_x25) ||
	    addr->sx25_family != AF_X25 ||
	    strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN)
		goto out;

	rc = -ENETUNREACH;
	rt = x25_get_route(&addr->sx25_addr);
	if (!rt)
		goto out;

	x25->neighbour = x25_get_neigh(rt->dev);
	if (!x25->neighbour)
		goto out_put_route;

	x25_limit_facilities(&x25->facilities, x25->neighbour);

	x25->lci = x25_new_lci(x25->neighbour);
	if (!x25->lci)
		goto out_put_neigh;

	rc = -EINVAL;
	if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding
					   does not work */
		goto out_put_neigh;

	if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr))
		memset(&x25->source_addr, '\0', X25_ADDR_LEN);

	x25->dest_addr = addr->sx25_addr;

	/* Move to connecting socket, start sending Connect Requests */
	sock->state  = SS_CONNECTING;
	sk->sk_state = TCP_SYN_SENT;

	x25->state = X25_STATE_1;

	x25_write_internal(sk, X25_CALL_REQUEST);

	x25_start_heartbeat(sk);
	x25_start_t21timer(sk);

	/* Now the loop */
	rc = -EINPROGRESS;
	if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK))
		goto out;

	rc = x25_wait_for_connection_establishment(sk);
	if (rc)
		goto out_put_neigh;

	sock->state = SS_CONNECTED;
	rc = 0;
out_put_neigh:
	if (rc && x25->neighbour) {
		read_lock_bh(&x25_list_lock);
		x25_neigh_put(x25->neighbour);
		x25->neighbour = NULL;
		read_unlock_bh(&x25_list_lock);
		x25->state = X25_STATE_0;
	}
out_put_route:
	x25_route_put(rt);
out:
	release_sock(sk);
	return rc;
}

static int x25_wait_for_data(struct sock *sk, long timeout)
{
	DECLARE_WAITQUEUE(wait, current);
	int rc = 0;

	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	for (;;) {
		__set_current_state(TASK_INTERRUPTIBLE);
		if (sk->sk_shutdown & RCV_SHUTDOWN)
			break;
		rc = -ERESTARTSYS;
		if (signal_pending(current))
			break;
		rc = -EAGAIN;
		if (!timeout)
			break;
		rc = 0;
		if (skb_queue_empty(&sk->sk_receive_queue)) {
			release_sock(sk);
			timeout = schedule_timeout(timeout);
			lock_sock(sk);
		} else
			break;
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}

static int x25_accept(struct socket *sock, struct socket *newsock,
		      struct proto_accept_arg *arg)
{
	struct sock *sk = sock->sk;
	struct sock *newsk;
	struct sk_buff *skb;
	int rc = -EINVAL;

	if (!sk)
		goto out;

	rc = -EOPNOTSUPP;
	if (sk->sk_type != SOCK_SEQPACKET)
		goto out;

	lock_sock(sk);
	rc = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out2;

	rc = x25_wait_for_data(sk, sk->sk_rcvtimeo);
	if (rc)
		goto out2;
	skb = skb_dequeue(&sk->sk_receive_queue);
	rc = -EINVAL;
	if (!skb->sk)
		goto out2;
	newsk = skb->sk;
	sock_graft(newsk, newsock);

	/* Now attach up the new socket */
	skb->sk = NULL;
	kfree_skb(skb);
	sk_acceptq_removed(sk);
	newsock->state = SS_CONNECTED;
	rc = 0;
out2:
	release_sock(sk);
out:
	return rc;
}

static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
		       int peer)
{
	struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr;
	struct sock *sk = sock->sk;
	struct x25_sock *x25 = x25_sk(sk);
	int rc = 0;

	if (peer) {
		if (sk->sk_state != TCP_ESTABLISHED) {
			rc = -ENOTCONN;
			goto out;
		}
		sx25->sx25_addr = x25->dest_addr;
	} else
		sx25->sx25_addr = x25->source_addr;

	sx25->sx25_family = AF_X25;
	rc = sizeof(*sx25);
out:
	return rc;
}

int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
			unsigned int lci)
{
	struct sock *sk;
	struct sock *make;
	struct x25_sock *makex25;
	struct x25_address source_addr, dest_addr;
	struct x25_facilities facilities;
	struct x25_dte_facilities dte_facilities;
	int len, addr_len, rc;

	/*
	 *	Remove the LCI and frame type.
	 */
	skb_pull(skb, X25_STD_MIN_LEN);

	/*
	 *	Extract the X.25 addresses and convert them to ASCII strings,
	 *	and remove them.
	 *
	 *	Address block is mandatory in call request packets
	 */
	addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr);
	if (addr_len <= 0)
		goto out_clear_request;
	skb_pull(skb, addr_len);

	/*
	 *	Get the length of the facilities, skip past them for the moment
	 *	get the call user data because this is needed to determine
	 *	the correct listener
	 *
	 *	Facilities length is mandatory in call request packets
	 */
	if (!pskb_may_pull(skb, 1))
		goto out_clear_request;
	len = skb->data[0] + 1;
	if (!pskb_may_pull(skb, len))
		goto out_clear_request;
	skb_pull(skb, len);
	/*
	 *	Ensure that the amount of call user data is valid.
	 */
	if (skb->len > X25_MAX_CUD_LEN)
		goto out_clear_request;

	/*
	 *	Get all the call user data so it can be used in
	 *	x25_find_listener and skb_copy_from_linear_data up ahead.
	 */
	if (!pskb_may_pull(skb, skb->len))
		goto out_clear_request;

	/*
	 *	Find a listener for the particular address/cud pair.
	 */
	sk = x25_find_listener(&source_addr, skb);
	skb_push(skb, len);

	if (sk != NULL && sk_acceptq_is_full(sk)) {
		goto out_sock_put;
	}

	/*
	 *	We dont have any listeners for this incoming call.
	 *	Try forwarding it.
	 */
	if (sk == NULL) {
		skb_push(skb, addr_len + X25_STD_MIN_LEN);
		if (sysctl_x25_forward &&
				x25_forward_call(&dest_addr, nb, skb, lci) > 0) {
			/* Call was forwarded, dont process it any more */
			kfree_skb(skb);
			rc = 1;
			goto out;
		} else {
			/* No listeners, can't forward, clear the call */
			goto out_clear_request;
		}
	}

	/*
	 *	Try to reach a compromise on the requested facilities.
	 */
	len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities);
	if (len == -1)
		goto out_sock_put;

	/*
	 * current neighbour/link might impose additional limits
	 * on certain facilities
	 */
	x25_limit_facilities(&facilities, nb);

	/*
	 *	Try to create a new socket.
	 */
	make = x25_make_new(sk);
	if (!make)
		goto out_sock_put;

	/*
	 *	Remove the facilities
	 */
	skb_pull(skb, len);

	skb->sk        = make;
	make->sk_state = TCP_ESTABLISHED;

	makex25 = x25_sk(make);
	makex25->lci            = lci;
	makex25->dest_addr      = dest_addr;
	makex25->source_addr    = source_addr;
	x25_neigh_hold(nb);
	makex25->neighbour      = nb;
	makex25->facilities     = facilities;
	makex25->dte_facilities = dte_facilities;
	makex25->vc_facil_mask  = x25_sk(sk)->vc_facil_mask;
	/* ensure no reverse facil on accept */
	makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
	/* ensure no calling address extension on accept */
	makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE;
	makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;

	/* Normally all calls are accepted immediately */
	if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) {
		x25_write_internal(make, X25_CALL_ACCEPTED);
		makex25->state = X25_STATE_3;
	} else {
		makex25->state = X25_STATE_5;
	}
	/*
	 *	Incoming Call User Data.
	 */
	skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len);
	makex25->calluserdata.cudlength = skb->len;

	sk_acceptq_added(sk);

	x25_insert_socket(make);

	skb_queue_head(&sk->sk_receive_queue, skb);

	x25_start_heartbeat(make);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	rc = 1;
	sock_put(sk);
out:
	return rc;
out_sock_put:
	sock_put(sk);
out_clear_request:
	rc = 0;
	x25_transmit_clear_request(nb, lci, 0x01);
	goto out;
}

static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct x25_sock *x25 = x25_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_x25 *, usx25, msg->msg_name);
	struct sockaddr_x25 sx25;
	struct sk_buff *skb;
	unsigned char *asmptr;
	int noblock = msg->msg_flags & MSG_DONTWAIT;
	size_t size;
	int qbit = 0, rc = -EINVAL;

	lock_sock(sk);
	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT))
		goto out;

	/* we currently don't support segmented records at the user interface */
	if (!(msg->msg_flags & (MSG_EOR|MSG_OOB)))
		goto out;

	rc = -EADDRNOTAVAIL;
	if (sock_flag(sk, SOCK_ZAPPED))
		goto out;

	rc = -EPIPE;
	if (sk->sk_shutdown & SEND_SHUTDOWN) {
		send_sig(SIGPIPE, current, 0);
		goto out;
	}

	rc = -ENETUNREACH;
	if (!x25->neighbour)
		goto out;

	if (usx25) {
		rc = -EINVAL;
		if (msg->msg_namelen < sizeof(sx25))
			goto out;
		memcpy(&sx25, usx25, sizeof(sx25));
		rc = -EISCONN;
		if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr))
			goto out;
		rc = -EINVAL;
		if (sx25.sx25_family != AF_X25)
			goto out;
	} else {
		/*
		 *	FIXME 1003.1g - if the socket is like this because
		 *	it has become closed (not started closed) we ought
		 *	to SIGPIPE, EPIPE;
		 */
		rc = -ENOTCONN;
		if (sk->sk_state != TCP_ESTABLISHED)
			goto out;

		sx25.sx25_family = AF_X25;
		sx25.sx25_addr   = x25->dest_addr;
	}

	/* Sanity check the packet size */
	if (len > 65535) {
		rc = -EMSGSIZE;
		goto out;
	}

	net_dbg_ratelimited("x25_sendmsg: sendto: Addresses built.\n");

	/* Build a packet */
	net_dbg_ratelimited("x25_sendmsg: sendto: building packet.\n");

	if ((msg->msg_flags & MSG_OOB) && len > 32)
		len = 32;

	size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN;

	release_sock(sk);
	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
	lock_sock(sk);
	if (!skb)
		goto out;
	X25_SKB_CB(skb)->flags = msg->msg_flags;

	skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN);

	/*
	 *	Put the data on the end
	 */
	net_dbg_ratelimited("x25_sendmsg: Copying user data\n");

	skb_reset_transport_header(skb);
	skb_put(skb, len);

	rc = memcpy_from_msg(skb_transport_header(skb), msg, len);
	if (rc)
		goto out_kfree_skb;

	/*
	 *	If the Q BIT Include socket option is in force, the first
	 *	byte of the user data is the logical value of the Q Bit.
	 */
	if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) {
		if (!pskb_may_pull(skb, 1))
			goto out_kfree_skb;
		qbit = skb->data[0];
		skb_pull(skb, 1);
	}

	/*
	 *	Push down the X.25 header
	 */
	net_dbg_ratelimited("x25_sendmsg: Building X.25 Header.\n");

	if (msg->msg_flags & MSG_OOB) {
		if (x25->neighbour->extended) {
			asmptr    = skb_push(skb, X25_STD_MIN_LEN);
			*asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ;
			*asmptr++ = (x25->lci >> 0) & 0xFF;
			*asmptr++ = X25_INTERRUPT;
		} else {
			asmptr    = skb_push(skb, X25_STD_MIN_LEN);
			*asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ;
			*asmptr++ = (x25->lci >> 0) & 0xFF;
			*asmptr++ = X25_INTERRUPT;
		}
	} else {
		if (x25->neighbour->extended) {
			/* Build an Extended X.25 header */
			asmptr    = skb_push(skb, X25_EXT_MIN_LEN);
			*asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ;
			*asmptr++ = (x25->lci >> 0) & 0xFF;
			*asmptr++ = X25_DATA;
			*asmptr++ = X25_DATA;
		} else {
			/* Build a Standard X.25 header */
			asmptr    = skb_push(skb, X25_STD_MIN_LEN);
			*asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ;
			*asmptr++ = (x25->lci >> 0) & 0xFF;
			*asmptr++ = X25_DATA;
		}

		if (qbit)
			skb->data[0] |= X25_Q_BIT;
	}

	net_dbg_ratelimited("x25_sendmsg: Built header.\n");
	net_dbg_ratelimited("x25_sendmsg: Transmitting buffer\n");

	rc = -ENOTCONN;
	if (sk->sk_state != TCP_ESTABLISHED)
		goto out_kfree_skb;

	if (msg->msg_flags & MSG_OOB)
		skb_queue_tail(&x25->interrupt_out_queue, skb);
	else {
		rc = x25_output(sk, skb);
		len = rc;
		if (rc < 0)
			kfree_skb(skb);
		else if (test_bit(X25_Q_BIT_FLAG, &x25->flags))
			len++;
	}

	x25_kick(sk);
	rc = len;
out:
	release_sock(sk);
	return rc;
out_kfree_skb:
	kfree_skb(skb);
	goto out;
}

static int x25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct x25_sock *x25 = x25_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_x25 *, sx25, msg->msg_name);
	size_t copied;
	int qbit, header_len;
	struct sk_buff *skb;
	unsigned char *asmptr;
	int rc = -ENOTCONN;

	lock_sock(sk);

	if (x25->neighbour == NULL)
		goto out;

	header_len = x25->neighbour->extended ?
		X25_EXT_MIN_LEN : X25_STD_MIN_LEN;

	/*
	 * This works for seqpacket too. The receiver has ordered the queue for
	 * us! We do one quick check first though
	 */
	if (sk->sk_state != TCP_ESTABLISHED)
		goto out;

	if (flags & MSG_OOB) {
		rc = -EINVAL;
		if (sock_flag(sk, SOCK_URGINLINE) ||
		    !skb_peek(&x25->interrupt_in_queue))
			goto out;

		skb = skb_dequeue(&x25->interrupt_in_queue);

		if (!pskb_may_pull(skb, X25_STD_MIN_LEN))
			goto out_free_dgram;

		skb_pull(skb, X25_STD_MIN_LEN);

		/*
		 *	No Q bit information on Interrupt data.
		 */
		if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) {
			asmptr  = skb_push(skb, 1);
			*asmptr = 0x00;
		}

		msg->msg_flags |= MSG_OOB;
	} else {
		/* Now we can treat all alike */
		release_sock(sk);
		skb = skb_recv_datagram(sk, flags, &rc);
		lock_sock(sk);
		if (!skb)
			goto out;

		if (!pskb_may_pull(skb, header_len))
			goto out_free_dgram;

		qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT;

		skb_pull(skb, header_len);

		if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) {
			asmptr  = skb_push(skb, 1);
			*asmptr = qbit;
		}
	}

	skb_reset_transport_header(skb);
	copied = skb->len;

	if (copied > size) {
		copied = size;
		msg->msg_flags |= MSG_TRUNC;
	}

	/* Currently, each datagram always contains a complete record */
	msg->msg_flags |= MSG_EOR;

	rc = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (rc)
		goto out_free_dgram;

	if (sx25) {
		sx25->sx25_family = AF_X25;
		sx25->sx25_addr   = x25->dest_addr;
		msg->msg_namelen  = sizeof(*sx25);
	}

	x25_check_rbuf(sk);
	rc = copied;
out_free_dgram:
	skb_free_datagram(sk, skb);
out:
	release_sock(sk);
	return rc;
}

static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	struct x25_sock *x25 = x25_sk(sk);
	void __user *argp = (void __user *)arg;
	int rc;

	switch (cmd) {
	case TIOCOUTQ: {
		int amount;

		amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
		if (amount < 0)
			amount = 0;
		rc = put_user(amount, (unsigned int __user *)argp);
		break;
	}

	case TIOCINQ: {
		struct sk_buff *skb;
		int amount = 0;
		/*
		 * These two are safe on a single CPU system as
		 * only user tasks fiddle here
		 */
		lock_sock(sk);
		if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
			amount = skb->len;
		release_sock(sk);
		rc = put_user(amount, (unsigned int __user *)argp);
		break;
	}

	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFMETRIC:
	case SIOCSIFMETRIC:
		rc = -EINVAL;
		break;
	case SIOCADDRT:
	case SIOCDELRT:
		rc = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			break;
		rc = x25_route_ioctl(cmd, argp);
		break;
	case SIOCX25GSUBSCRIP:
		rc = x25_subscr_ioctl(cmd, argp);
		break;
	case SIOCX25SSUBSCRIP:
		rc = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			break;
		rc = x25_subscr_ioctl(cmd, argp);
		break;
	case SIOCX25GFACILITIES: {
		lock_sock(sk);
		rc = copy_to_user(argp, &x25->facilities,
				  sizeof(x25->facilities)) ?
			-EFAULT : 0;
		release_sock(sk);
		break;
	}

	case SIOCX25SFACILITIES: {
		struct x25_facilities facilities;
		rc = -EFAULT;
		if (copy_from_user(&facilities, argp, sizeof(facilities)))
			break;
		rc = -EINVAL;
		lock_sock(sk);
		if (sk->sk_state != TCP_LISTEN &&
		    sk->sk_state != TCP_CLOSE)
			goto out_fac_release;
		if (facilities.pacsize_in < X25_PS16 ||
		    facilities.pacsize_in > X25_PS4096)
			goto out_fac_release;
		if (facilities.pacsize_out < X25_PS16 ||
		    facilities.pacsize_out > X25_PS4096)
			goto out_fac_release;
		if (facilities.winsize_in < 1 ||
		    facilities.winsize_in > 127)
			goto out_fac_release;
		if (facilities.throughput) {
			int out = facilities.throughput & 0xf0;
			int in  = facilities.throughput & 0x0f;

			if (!out)
				facilities.throughput |=
					X25_DEFAULT_THROUGHPUT << 4;
			else if (out < 0x30 || out > 0xD0)
				goto out_fac_release;
			if (!in)
				facilities.throughput |=
					X25_DEFAULT_THROUGHPUT;
			else if (in < 0x03 || in > 0x0D)
				goto out_fac_release;
		}
		if (facilities.reverse &&
		    (facilities.reverse & 0x81) != 0x81)
			goto out_fac_release;
		x25->facilities = facilities;
		rc = 0;
out_fac_release:
		release_sock(sk);
		break;
	}

	case SIOCX25GDTEFACILITIES: {
		lock_sock(sk);
		rc = copy_to_user(argp, &x25->dte_facilities,
				  sizeof(x25->dte_facilities));
		release_sock(sk);
		if (rc)
			rc = -EFAULT;
		break;
	}

	case SIOCX25SDTEFACILITIES: {
		struct x25_dte_facilities dtefacs;
		rc = -EFAULT;
		if (copy_from_user(&dtefacs, argp, sizeof(dtefacs)))
			break;
		rc = -EINVAL;
		lock_sock(sk);
		if (sk->sk_state != TCP_LISTEN &&
		    sk->sk_state != TCP_CLOSE)
			goto out_dtefac_release;
		if (dtefacs.calling_len > X25_MAX_AE_LEN)
			goto out_dtefac_release;
		if (dtefacs.called_len > X25_MAX_AE_LEN)
			goto out_dtefac_release;
		x25->dte_facilities = dtefacs;
		rc = 0;
out_dtefac_release:
		release_sock(sk);
		break;
	}

	case SIOCX25GCALLUSERDATA: {
		lock_sock(sk);
		rc = copy_to_user(argp, &x25->calluserdata,
				  sizeof(x25->calluserdata)) ? -EFAULT : 0;
		release_sock(sk);
		break;
	}

	case SIOCX25SCALLUSERDATA: {
		struct x25_calluserdata calluserdata;

		rc = -EFAULT;
		if (copy_from_user(&calluserdata, argp, sizeof(calluserdata)))
			break;
		rc = -EINVAL;
		if (calluserdata.cudlength > X25_MAX_CUD_LEN)
			break;
		lock_sock(sk);
		x25->calluserdata = calluserdata;
		release_sock(sk);
		rc = 0;
		break;
	}

	case SIOCX25GCAUSEDIAG: {
		lock_sock(sk);
		rc = copy_to_user(argp, &x25->causediag,
				  sizeof(x25->causediag)) ?
			-EFAULT : 0;
		release_sock(sk);
		break;
	}

	case SIOCX25SCAUSEDIAG: {
		struct x25_causediag causediag;
		rc = -EFAULT;
		if (copy_from_user(&causediag, argp, sizeof(causediag)))
			break;
		lock_sock(sk);
		x25->causediag = causediag;
		release_sock(sk);
		rc = 0;
		break;
	}

	case SIOCX25SCUDMATCHLEN: {
		struct x25_subaddr sub_addr;
		rc = -EINVAL;
		lock_sock(sk);
		if (sk->sk_state != TCP_CLOSE)
			goto out_cud_release;
		rc = -EFAULT;
		if (copy_from_user(&sub_addr, argp, sizeof(sub_addr)))
			goto out_cud_release;
		rc = -EINVAL;
		if (sub_addr.cudmatchlength > X25_MAX_CUD_LEN)
			goto out_cud_release;
		x25->cudmatchlength = sub_addr.cudmatchlength;
		rc = 0;
out_cud_release:
		release_sock(sk);
		break;
	}

	case SIOCX25CALLACCPTAPPRV: {
		rc = -EINVAL;
		lock_sock(sk);
		if (sk->sk_state == TCP_CLOSE) {
			clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags);
			rc = 0;
		}
		release_sock(sk);
		break;
	}

	case SIOCX25SENDCALLACCPT: {
		rc = -EINVAL;
		lock_sock(sk);
		if (sk->sk_state != TCP_ESTABLISHED)
			goto out_sendcallaccpt_release;
		/* must call accptapprv above */
		if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags))
			goto out_sendcallaccpt_release;
		x25_write_internal(sk, X25_CALL_ACCEPTED);
		x25->state = X25_STATE_3;
		rc = 0;
out_sendcallaccpt_release:
		release_sock(sk);
		break;
	}

	default:
		rc = -ENOIOCTLCMD;
		break;
	}

	return rc;
}

static const struct net_proto_family x25_family_ops = {
	.family	= AF_X25,
	.create	= x25_create,
	.owner	= THIS_MODULE,
};
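/*
 * Editorial usage sketch (hypothetical, not part of the kernel source):
 * the private ioctls above implement manual call approval. A listener
 * can request call-user-data matching and defer the Call Accepted packet
 * until it has inspected the call (error checks elided; "ns" stands for
 * the file descriptor returned by accept()):
 */
#if 0	/* example only */
	struct x25_subaddr sub = { .cudmatchlength = 4 };

	ioctl(s, SIOCX25SCUDMATCHLEN, &sub);	/* while still TCP_CLOSE */
	ioctl(s, SIOCX25CALLACCPTAPPRV, NULL);	/* switch to manual accept */
	/* ... listen(), accept(), inspect the call ... */
	ioctl(ns, SIOCX25SENDCALLACCPT, NULL);	/* on the accepted socket */
#endif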
#ifdef CONFIG_COMPAT
static int compat_x25_subscr_ioctl(unsigned int cmd,
		struct compat_x25_subscrip_struct __user *x25_subscr32)
{
	struct compat_x25_subscrip_struct x25_subscr;
	struct x25_neigh *nb;
	struct net_device *dev;
	int rc = -EINVAL;

	rc = -EFAULT;
	if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32)))
		goto out;

	rc = -EINVAL;
	dev = x25_dev_get(x25_subscr.device);
	if (dev == NULL)
		goto out;

	nb = x25_get_neigh(dev);
	if (nb == NULL)
		goto out_dev_put;

	dev_put(dev);

	if (cmd == SIOCX25GSUBSCRIP) {
		read_lock_bh(&x25_neigh_list_lock);
		x25_subscr.extended = nb->extended;
		x25_subscr.global_facil_mask = nb->global_facil_mask;
		read_unlock_bh(&x25_neigh_list_lock);
		rc = copy_to_user(x25_subscr32, &x25_subscr,
				sizeof(*x25_subscr32)) ? -EFAULT : 0;
	} else {
		rc = -EINVAL;
		if (x25_subscr.extended == 0 || x25_subscr.extended == 1) {
			rc = 0;
			write_lock_bh(&x25_neigh_list_lock);
			nb->extended = x25_subscr.extended;
			nb->global_facil_mask = x25_subscr.global_facil_mask;
			write_unlock_bh(&x25_neigh_list_lock);
		}
	}
	x25_neigh_put(nb);
out:
	return rc;
out_dev_put:
	dev_put(dev);
	goto out;
}

static int compat_x25_ioctl(struct socket *sock, unsigned int cmd,
				unsigned long arg)
{
	void __user *argp = compat_ptr(arg);
	int rc = -ENOIOCTLCMD;

	switch (cmd) {
	case TIOCOUTQ:
	case TIOCINQ:
		rc = x25_ioctl(sock, cmd, (unsigned long)argp);
		break;
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFMETRIC:
	case SIOCSIFMETRIC:
		rc = -EINVAL;
		break;
	case SIOCADDRT:
	case SIOCDELRT:
		rc = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			break;
		rc = x25_route_ioctl(cmd, argp);
		break;
	case SIOCX25GSUBSCRIP:
		rc = compat_x25_subscr_ioctl(cmd, argp);
		break;
	case SIOCX25SSUBSCRIP:
		rc = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			break;
		rc = compat_x25_subscr_ioctl(cmd, argp);
		break;
	case SIOCX25GFACILITIES:
	case SIOCX25SFACILITIES:
	case SIOCX25GDTEFACILITIES:
	case SIOCX25SDTEFACILITIES:
	case SIOCX25GCALLUSERDATA:
	case SIOCX25SCALLUSERDATA:
	case SIOCX25GCAUSEDIAG:
	case SIOCX25SCAUSEDIAG:
	case SIOCX25SCUDMATCHLEN:
	case SIOCX25CALLACCPTAPPRV:
	case SIOCX25SENDCALLACCPT:
		rc = x25_ioctl(sock, cmd, (unsigned long)argp);
		break;
	default:
		rc = -ENOIOCTLCMD;
		break;
	}
	return rc;
}
#endif

static const struct proto_ops x25_proto_ops = {
	.family		= AF_X25,
	.owner		= THIS_MODULE,
	.release	= x25_release,
	.bind		= x25_bind,
	.connect	= x25_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= x25_accept,
	.getname	= x25_getname,
	.poll		= datagram_poll,
	.ioctl		= x25_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_x25_ioctl,
#endif
	.gettstamp	= sock_gettstamp,
	.listen		= x25_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= x25_setsockopt,
	.getsockopt	= x25_getsockopt,
	.sendmsg	= x25_sendmsg,
	.recvmsg	= x25_recvmsg,
	.mmap		= sock_no_mmap,
};

static struct packet_type x25_packet_type __read_mostly = {
	.type =	cpu_to_be16(ETH_P_X25),
	.func =	x25_lapb_receive_frame,
};

static struct notifier_block x25_dev_notifier = {
	.notifier_call = x25_device_event,
};

void x25_kill_by_neigh(struct x25_neigh *nb)
{
	struct sock *s;

	write_lock_bh(&x25_list_lock);

	sk_for_each(s, &x25_list) {
		if (x25_sk(s)->neighbour == nb) {
			write_unlock_bh(&x25_list_lock);
			lock_sock(s);
			x25_disconnect(s, ENETUNREACH, 0, 0);
			release_sock(s);
			write_lock_bh(&x25_list_lock);
		}
	}
	write_unlock_bh(&x25_list_lock);

	/* Remove any related forwards */
	x25_clear_forward_by_dev(nb->dev);
}

static int __init x25_init(void)
{
	int rc;

	rc = proto_register(&x25_proto, 0);
	if (rc)
		goto out;

	rc = sock_register(&x25_family_ops);
	if (rc)
		goto out_proto;

	dev_add_pack(&x25_packet_type);

	rc = register_netdevice_notifier(&x25_dev_notifier);
	if (rc)
		goto out_sock;

	rc = x25_register_sysctl();
	if (rc)
		goto out_dev;

	rc = x25_proc_init();
	if (rc)
		goto out_sysctl;

	pr_info("Linux Version 0.2\n");

out:
	return rc;
out_sysctl:
	x25_unregister_sysctl();
out_dev:
	unregister_netdevice_notifier(&x25_dev_notifier);
out_sock:
	dev_remove_pack(&x25_packet_type);
	sock_unregister(AF_X25);
out_proto:
	proto_unregister(&x25_proto);
	goto out;
}
module_init(x25_init);

static void __exit x25_exit(void)
{
	x25_proc_exit();
	x25_link_free();
	x25_route_free();

	x25_unregister_sysctl();

	unregister_netdevice_notifier(&x25_dev_notifier);

	dev_remove_pack(&x25_packet_type);
	sock_unregister(AF_X25);
	proto_unregister(&x25_proto);
}
module_exit(x25_exit);

MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>");
MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_X25);
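/*
 * Editorial usage sketch (hypothetical, not part of the kernel source) for
 * the data path above: records travel whole, so x25_sendmsg() insists on
 * MSG_EOR (or MSG_OOB, which maps to an X.25 interrupt packet and is
 * truncated to 32 bytes), and x25_recvmsg() sets MSG_EOR on each complete
 * record it returns:
 */
#if 0	/* example only */
	char buf[128];
	ssize_t n;

	send(s, "ping", 4, MSG_EOR);		/* one complete record */
	n = recv(s, buf, sizeof(buf), 0);	/* returns with MSG_EOR set */
#endif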
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Bridge netlink control interface
 *
 *	Authors:
 *	Stephen Hemminger		<shemminger@osdl.org>
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/etherdevice.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <uapi/linux/if_bridge.h>

#include "br_private.h"
#include "br_private_stp.h"
#include "br_private_cfm.h"
#include "br_private_tunnel.h"
#include "br_private_mcast_eht.h"

static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
				u32 filter_mask)
{
	struct net_bridge_vlan *v;
	u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0;
	u16 flags, pvid;
	int num_vlans = 0;

	if (!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED))
		return 0;

	pvid = br_get_pvid(vg);
	/* Count number of vlan infos */
	list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
		flags = 0;
		/* only a context, bridge vlan not activated */
		if (!br_vlan_should_use(v))
			continue;
		if (v->vid == pvid)
			flags |= BRIDGE_VLAN_INFO_PVID;

		if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
			flags |= BRIDGE_VLAN_INFO_UNTAGGED;

		if (vid_range_start == 0) {
			goto initvars;
		} else if ((v->vid - vid_range_end) == 1 &&
			flags == vid_range_flags) {
			vid_range_end = v->vid;
			continue;
		} else {
			if ((vid_range_end - vid_range_start) > 0)
				num_vlans += 2;
			else
				num_vlans += 1;
		}
initvars:
		vid_range_start = v->vid;
		vid_range_end = v->vid;
		vid_range_flags = flags;
	}

	if (vid_range_start != 0) {
		if ((vid_range_end - vid_range_start) > 0)
			num_vlans += 2;
		else
			num_vlans += 1;
	}

	return num_vlans;
}

static int br_get_num_vlan_infos(struct net_bridge_vlan_group *vg,
				 u32 filter_mask)
{
	int num_vlans;

	if (!vg)
		return 0;

	if (filter_mask & RTEXT_FILTER_BRVLAN)
		return vg->num_vlans;

	rcu_read_lock();
	num_vlans = __get_num_vlan_infos(vg, filter_mask);
	rcu_read_unlock();

	return num_vlans;
}
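/*
 * Editorial worked example (not part of the original source): with VLANs
 * 1, 2, 3 and 10 on a port, all untagged and none of them the PVID, the
 * compressed dump needs two bridge_vlan_info entries for the contiguous
 * 1-3 range (RANGE_BEGIN and RANGE_END) plus one for the lone VLAN 10,
 * so __get_num_vlan_infos() returns 3.
 */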
static size_t br_get_link_af_size_filtered(const struct net_device *dev,
					   u32 filter_mask)
{
	struct net_bridge_vlan_group *vg = NULL;
	struct net_bridge_port *p = NULL;
	struct net_bridge *br = NULL;
	u32 num_cfm_peer_mep_infos;
	u32 num_cfm_mep_infos;
	size_t vinfo_sz = 0;
	int num_vlan_infos;

	rcu_read_lock();
	if (netif_is_bridge_port(dev)) {
		p = br_port_get_check_rcu(dev);
		if (p)
			vg = nbp_vlan_group_rcu(p);
	} else if (netif_is_bridge_master(dev)) {
		br = netdev_priv(dev);
		vg = br_vlan_group_rcu(br);
	}
	num_vlan_infos = br_get_num_vlan_infos(vg, filter_mask);
	rcu_read_unlock();

	if (p && (p->flags & BR_VLAN_TUNNEL))
		vinfo_sz += br_get_vlan_tunnel_info_size(vg);

	/* Each VLAN is returned in bridge_vlan_info along with flags */
	vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));

	if (p && vg && (filter_mask & RTEXT_FILTER_MST))
		vinfo_sz += br_mst_info_size(vg);

	if (!(filter_mask & RTEXT_FILTER_CFM_STATUS))
		return vinfo_sz;

	if (!br)
		return vinfo_sz;

	/* CFM status info must be added */
	br_cfm_mep_count(br, &num_cfm_mep_infos);
	br_cfm_peer_mep_count(br, &num_cfm_peer_mep_infos);

	vinfo_sz += nla_total_size(0);	/* IFLA_BRIDGE_CFM */
	/* For each status struct the MEP instance (u32) is added */
	/* MEP instance (u32) + br_cfm_mep_status */
	vinfo_sz += num_cfm_mep_infos *
		     /*IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE */
		    (nla_total_size(sizeof(u32))
		     /* IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN */
		     + nla_total_size(sizeof(u32))
		     /* IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN */
		     + nla_total_size(sizeof(u32))
		     /* IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN */
		     + nla_total_size(sizeof(u32)));
	/* MEP instance (u32) + br_cfm_cc_peer_status */
	vinfo_sz += num_cfm_peer_mep_infos *
		     /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE */
		    (nla_total_size(sizeof(u32))
		     /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID */
		     + nla_total_size(sizeof(u32))
		     /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT */
		     + nla_total_size(sizeof(u32))
		     /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI */
		     + nla_total_size(sizeof(u32))
		     /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE */
		     + nla_total_size(sizeof(u8))
		     /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE */
		     + nla_total_size(sizeof(u8))
		     /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN */
		     + nla_total_size(sizeof(u32))
		     /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN */
		     + nla_total_size(sizeof(u32))
		     /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN */
		     + nla_total_size(sizeof(u32)));

	return vinfo_sz;
}

static inline size_t br_port_info_size(void)
{
	return nla_total_size(1)	/* IFLA_BRPORT_STATE  */
		+ nla_total_size(2)	/* IFLA_BRPORT_PRIORITY */
		+ nla_total_size(4)	/* IFLA_BRPORT_COST */
		+ nla_total_size(1)	/* IFLA_BRPORT_MODE */
		+ nla_total_size(1)	/* IFLA_BRPORT_GUARD */
		+ nla_total_size(1)	/* IFLA_BRPORT_PROTECT */
		+ nla_total_size(1)	/* IFLA_BRPORT_FAST_LEAVE */
		+ nla_total_size(1)	/* IFLA_BRPORT_MCAST_TO_UCAST */
		+ nla_total_size(1)	/* IFLA_BRPORT_LEARNING */
		+ nla_total_size(1)	/* IFLA_BRPORT_UNICAST_FLOOD */
		+ nla_total_size(1)	/* IFLA_BRPORT_MCAST_FLOOD */
		+ nla_total_size(1)	/* IFLA_BRPORT_BCAST_FLOOD */
		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP */
		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP_WIFI */
		+ nla_total_size(1)	/* IFLA_BRPORT_VLAN_TUNNEL */
		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_SUPPRESS */
		+ nla_total_size(1)	/* IFLA_BRPORT_ISOLATED */
		+ nla_total_size(1)	/* IFLA_BRPORT_LOCKED */
		+ nla_total_size(1)	/* IFLA_BRPORT_MAB */
		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_VLAN_SUPPRESS */
		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_COST */
static inline size_t br_port_info_size(void)
{
	return nla_total_size(1)	/* IFLA_BRPORT_STATE */
		+ nla_total_size(2)	/* IFLA_BRPORT_PRIORITY */
		+ nla_total_size(4)	/* IFLA_BRPORT_COST */
		+ nla_total_size(1)	/* IFLA_BRPORT_MODE */
		+ nla_total_size(1)	/* IFLA_BRPORT_GUARD */
		+ nla_total_size(1)	/* IFLA_BRPORT_PROTECT */
		+ nla_total_size(1)	/* IFLA_BRPORT_FAST_LEAVE */
		+ nla_total_size(1)	/* IFLA_BRPORT_MCAST_TO_UCAST */
		+ nla_total_size(1)	/* IFLA_BRPORT_LEARNING */
		+ nla_total_size(1)	/* IFLA_BRPORT_UNICAST_FLOOD */
		+ nla_total_size(1)	/* IFLA_BRPORT_MCAST_FLOOD */
		+ nla_total_size(1)	/* IFLA_BRPORT_BCAST_FLOOD */
		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP */
		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP_WIFI */
		+ nla_total_size(1)	/* IFLA_BRPORT_VLAN_TUNNEL */
		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_SUPPRESS */
		+ nla_total_size(1)	/* IFLA_BRPORT_ISOLATED */
		+ nla_total_size(1)	/* IFLA_BRPORT_LOCKED */
		+ nla_total_size(1)	/* IFLA_BRPORT_MAB */
		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_VLAN_SUPPRESS */
		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_COST */
		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_ID */
		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_NO */
		+ nla_total_size(sizeof(u8))	/* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */
		+ nla_total_size(sizeof(u8))	/* IFLA_BRPORT_CONFIG_PENDING */
		+ nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */
		+ nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */
		+ nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
		+ nla_total_size(sizeof(u8))	/* IFLA_BRPORT_MULTICAST_ROUTER */
		+ nla_total_size(sizeof(u32))	/* IFLA_BRPORT_MCAST_N_GROUPS */
		+ nla_total_size(sizeof(u32))	/* IFLA_BRPORT_MCAST_MAX_GROUPS */
#endif
		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_GROUP_FWD_MASK */
		+ nla_total_size(sizeof(u8))	/* IFLA_BRPORT_MRP_RING_OPEN */
		+ nla_total_size(sizeof(u8))	/* IFLA_BRPORT_MRP_IN_OPEN */
		+ nla_total_size(sizeof(u32))	/* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */
		+ nla_total_size(sizeof(u32))	/* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */
		+ nla_total_size(sizeof(u32))	/* IFLA_BRPORT_BACKUP_NHID */
		+ 0;
}

static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask)
{
	return NLMSG_ALIGN(sizeof(struct ifinfomsg))
		+ nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
		+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
		+ nla_total_size(4) /* IFLA_MASTER */
		+ nla_total_size(4) /* IFLA_MTU */
		+ nla_total_size(4) /* IFLA_LINK */
		+ nla_total_size(1) /* IFLA_OPERSTATE */
		+ nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */
		+ nla_total_size(br_get_link_af_size_filtered(dev,
				 filter_mask)) /* IFLA_AF_SPEC */
		+ nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */
}

static int br_port_fill_attrs(struct sk_buff *skb,
			      const struct net_bridge_port *p)
{
	u8 mode = !!(p->flags & BR_HAIRPIN_MODE);
	struct net_bridge_port *backup_p;
	u64 timerval;

	if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) ||
	    nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) ||
	    nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) ||
	    nla_put_u8(skb, IFLA_BRPORT_MODE, mode) ||
	    nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) ||
	    nla_put_u8(skb, IFLA_BRPORT_PROTECT,
		       !!(p->flags & BR_ROOT_BLOCK)) ||
	    nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
		       !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
	    nla_put_u8(skb, IFLA_BRPORT_MCAST_TO_UCAST,
		       !!(p->flags & BR_MULTICAST_TO_UNICAST)) ||
	    nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
	    nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
		       !!(p->flags & BR_FLOOD)) ||
	    nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD,
		       !!(p->flags & BR_MCAST_FLOOD)) ||
	    nla_put_u8(skb, IFLA_BRPORT_BCAST_FLOOD,
		       !!(p->flags & BR_BCAST_FLOOD)) ||
	    nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) ||
	    nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI,
		       !!(p->flags & BR_PROXYARP_WIFI)) ||
	    nla_put(skb, IFLA_BRPORT_ROOT_ID, sizeof(struct ifla_bridge_id),
		    &p->designated_root) ||
	    nla_put(skb, IFLA_BRPORT_BRIDGE_ID, sizeof(struct ifla_bridge_id),
		    &p->designated_bridge) ||
	    nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_PORT, p->designated_port) ||
	    nla_put_u16(skb, IFLA_BRPORT_DESIGNATED_COST, p->designated_cost) ||
	    nla_put_u16(skb, IFLA_BRPORT_ID, p->port_id) ||
	    nla_put_u16(skb, IFLA_BRPORT_NO, p->port_no) ||
	    nla_put_u8(skb, IFLA_BRPORT_TOPOLOGY_CHANGE_ACK,
		       p->topology_change_ack) ||
	    nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
	    nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL,
		       !!(p->flags & BR_VLAN_TUNNEL)) ||
	    nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
	    nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
		       !!(p->flags & BR_NEIGH_SUPPRESS)) ||
	    nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN,
		       !!(p->flags & BR_MRP_LOST_CONT)) ||
	    nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN,
		       !!(p->flags & BR_MRP_LOST_IN_CONT)) ||
	    nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) ||
	    nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) ||
	    nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)) ||
	    nla_put_u8(skb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
		       !!(p->flags & BR_NEIGH_VLAN_SUPPRESS)))
		return -EMSGSIZE;

	timerval = br_timer_value(&p->message_age_timer);
	if (nla_put_u64_64bit(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval,
			      IFLA_BRPORT_PAD))
		return -EMSGSIZE;
	timerval = br_timer_value(&p->forward_delay_timer);
	if (nla_put_u64_64bit(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval,
			      IFLA_BRPORT_PAD))
		return -EMSGSIZE;
	timerval = br_timer_value(&p->hold_timer);
	if (nla_put_u64_64bit(skb, IFLA_BRPORT_HOLD_TIMER, timerval,
			      IFLA_BRPORT_PAD))
		return -EMSGSIZE;

#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
	if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER,
		       p->multicast_ctx.multicast_router) ||
	    nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT,
			p->multicast_eht_hosts_limit) ||
	    nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT,
			p->multicast_eht_hosts_cnt) ||
	    nla_put_u32(skb, IFLA_BRPORT_MCAST_N_GROUPS,
			br_multicast_ngroups_get(&p->multicast_ctx)) ||
	    nla_put_u32(skb, IFLA_BRPORT_MCAST_MAX_GROUPS,
			br_multicast_ngroups_get_max(&p->multicast_ctx)))
		return -EMSGSIZE;
#endif

	/* we might be called only with br->lock */
	rcu_read_lock();
	backup_p = rcu_dereference(p->backup_port);
	if (backup_p)
		nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT,
			    backup_p->dev->ifindex);
	rcu_read_unlock();

	if (p->backup_nhid &&
	    nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid))
		return -EMSGSIZE;

	return 0;
}

static int br_fill_ifvlaninfo_range(struct sk_buff *skb, u16 vid_start,
				    u16 vid_end, u16 flags)
{
	struct bridge_vlan_info vinfo;

	if ((vid_end - vid_start) > 0) {
		/* add range to skb */
		vinfo.vid = vid_start;
		vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_BEGIN;
		if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
			    sizeof(vinfo), &vinfo))
			goto nla_put_failure;

		vinfo.vid = vid_end;
		vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END;
		if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
			    sizeof(vinfo), &vinfo))
			goto nla_put_failure;
	} else {
		vinfo.vid = vid_start;
		vinfo.flags = flags;
		if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
			    sizeof(vinfo), &vinfo))
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb,
					 struct net_bridge_vlan_group *vg)
{
	struct net_bridge_vlan *v;
	u16 vid_range_start = 0, vid_range_end = 0, vid_range_flags = 0;
	u16 flags, pvid;
	int err = 0;

	/* Pack IFLA_BRIDGE_VLAN_INFO's for every vlan
	 * and mark vlan info with begin and end flags
	 * if vlaninfo represents a range
	 */
	pvid = br_get_pvid(vg);
	list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
		flags = 0;
		if (!br_vlan_should_use(v))
			continue;
		if (v->vid == pvid)
			flags |= BRIDGE_VLAN_INFO_PVID;

		if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
			flags |= BRIDGE_VLAN_INFO_UNTAGGED;

		if (vid_range_start == 0) {
			goto initvars;
		} else if ((v->vid - vid_range_end) == 1 &&
			   flags == vid_range_flags) {
			vid_range_end = v->vid;
			continue;
		} else {
			err = br_fill_ifvlaninfo_range(skb, vid_range_start,
						       vid_range_end,
						       vid_range_flags);
			if (err)
				return err;
		}

initvars:
		vid_range_start = v->vid;
		vid_range_end = v->vid;
		vid_range_flags = flags;
	}

	if (vid_range_start != 0) {
		/* Call it once more to send any left over vlans */
		err = br_fill_ifvlaninfo_range(skb, vid_range_start,
					       vid_range_end,
					       vid_range_flags);
		if (err)
			return err;
	}

	return 0;
}

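/*
 * Editor's note (illustrative): continuing the earlier example, with the
 * compressed filter a run of VIDs 10-12 untagged is emitted as two
 * IFLA_BRIDGE_VLAN_INFO attributes -- {vid=10, flags=UNTAGGED|RANGE_BEGIN}
 * and {vid=12, flags=UNTAGGED|RANGE_END} -- while the lone PVID 100 goes
 * out as a single {vid=100, flags=PVID} entry.
 */
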
static int br_fill_ifvlaninfo(struct sk_buff *skb,
			      struct net_bridge_vlan_group *vg)
{
	struct bridge_vlan_info vinfo;
	struct net_bridge_vlan *v;
	u16 pvid;

	pvid = br_get_pvid(vg);
	list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
		if (!br_vlan_should_use(v))
			continue;

		vinfo.vid = v->vid;
		vinfo.flags = 0;
		if (v->vid == pvid)
			vinfo.flags |= BRIDGE_VLAN_INFO_PVID;

		if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
			vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED;

		if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
			    sizeof(vinfo), &vinfo))
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/*
 * Create one netlink message for one interface
 * Contains port and master info as well as carrier and bridge state.
 */
static int br_fill_ifinfo(struct sk_buff *skb,
			  const struct net_bridge_port *port,
			  u32 pid, u32 seq, int event, unsigned int flags,
			  u32 filter_mask, const struct net_device *dev,
			  bool getlink)
{
	u8 operstate = netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN;
	struct nlattr *af = NULL;
	struct net_bridge *br;
	struct ifinfomsg *hdr;
	struct nlmsghdr *nlh;

	if (port)
		br = port->br;
	else
		br = netdev_priv(dev);

	br_debug(br, "br_fill_info event %d port %s master %s\n",
		 event, dev->name, br->dev->name);

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	hdr = nlmsg_data(nlh);
	hdr->ifi_family = AF_BRIDGE;
	hdr->__ifi_pad = 0;
	hdr->ifi_type = dev->type;
	hdr->ifi_index = dev->ifindex;
	hdr->ifi_flags = dev_get_flags(dev);
	hdr->ifi_change = 0;

	if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
	    nla_put_u32(skb, IFLA_MASTER, br->dev->ifindex) ||
	    nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
	    nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
	    (dev->addr_len &&
	     nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
	    (dev->ifindex != dev_get_iflink(dev) &&
	     nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
		goto nla_put_failure;

	if (event == RTM_NEWLINK && port) {
		struct nlattr *nest;

		nest = nla_nest_start(skb, IFLA_PROTINFO);
		if (nest == NULL || br_port_fill_attrs(skb, port) < 0)
			goto nla_put_failure;
		nla_nest_end(skb, nest);
	}

	if (filter_mask & (RTEXT_FILTER_BRVLAN |
			   RTEXT_FILTER_BRVLAN_COMPRESSED |
			   RTEXT_FILTER_MRP |
			   RTEXT_FILTER_CFM_CONFIG |
			   RTEXT_FILTER_CFM_STATUS |
			   RTEXT_FILTER_MST)) {
		af = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
		if (!af)
			goto nla_put_failure;
	}

	/* Check if the VID information is requested */
	if ((filter_mask & RTEXT_FILTER_BRVLAN) ||
	    (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) {
		struct net_bridge_vlan_group *vg;
		int err;

		/* RCU needed because of the VLAN locking rules (rcu || rtnl) */
		rcu_read_lock();
		if (port)
			vg = nbp_vlan_group_rcu(port);
		else
			vg = br_vlan_group_rcu(br);

		if (!vg || !vg->num_vlans) {
			rcu_read_unlock();
			goto done;
		}
		if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)
			err = br_fill_ifvlaninfo_compressed(skb, vg);
		else
			err = br_fill_ifvlaninfo(skb, vg);

		if (port && (port->flags & BR_VLAN_TUNNEL))
			err = br_fill_vlan_tunnel_info(skb, vg);
		rcu_read_unlock();
		if (err)
			goto nla_put_failure;
	}

	if (filter_mask & RTEXT_FILTER_MRP) {
		int err;

		if (!br_mrp_enabled(br) || port)
			goto done;

		rcu_read_lock();
		err = br_mrp_fill_info(skb, br);
		rcu_read_unlock();

		if (err)
			goto nla_put_failure;
	}

	if (filter_mask & (RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS)) {
		struct nlattr *cfm_nest = NULL;
		int err;

		if (!br_cfm_created(br) || port)
			goto done;

		cfm_nest = nla_nest_start(skb, IFLA_BRIDGE_CFM);
		if (!cfm_nest)
			goto nla_put_failure;

		if (filter_mask & RTEXT_FILTER_CFM_CONFIG) {
			rcu_read_lock();
			err = br_cfm_config_fill_info(skb, br);
			rcu_read_unlock();
			if (err)
				goto nla_put_failure;
		}

		if (filter_mask & RTEXT_FILTER_CFM_STATUS) {
			rcu_read_lock();
			err = br_cfm_status_fill_info(skb, br, getlink);
			rcu_read_unlock();
			if (err)
				goto nla_put_failure;
		}

		nla_nest_end(skb, cfm_nest);
	}

	if ((filter_mask & RTEXT_FILTER_MST) &&
	    br_opt_get(br, BROPT_MST_ENABLED) && port) {
		const struct net_bridge_vlan_group *vg = nbp_vlan_group(port);
		struct nlattr *mst_nest;
		int err;

		if (!vg || !vg->num_vlans)
			goto done;

		mst_nest = nla_nest_start(skb, IFLA_BRIDGE_MST);
		if (!mst_nest)
			goto nla_put_failure;

		err = br_mst_fill_info(skb, vg);
		if (err)
			goto nla_put_failure;

		nla_nest_end(skb, mst_nest);
	}

done:
	if (af) {
		if (nlmsg_get_pos(skb) - (void *)af > nla_attr_size(0))
			nla_nest_end(skb, af);
		else
			nla_nest_cancel(skb, af);
	}

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

void br_info_notify(int event, const struct net_bridge *br,
		    const struct net_bridge_port *port, u32 filter)
{
	struct net_device *dev;
	struct sk_buff *skb;
	int err = -ENOBUFS;
	struct net *net;
	u16 port_no = 0;

	if (WARN_ON(!port && !br))
		return;

	if (port) {
		dev = port->dev;
		br = port->br;
		port_no = port->port_no;
	} else {
		dev = br->dev;
	}

	net = dev_net(dev);
	br_debug(br, "port %u(%s) event %d\n", port_no, dev->name, event);

	skb = nlmsg_new(br_nlmsg_size(dev, filter), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev, false);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in br_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
	return;
errout:
	rtnl_set_sk_err(net, RTNLGRP_LINK, err);
}

/* Notify listeners of a change in bridge or port information */
void br_ifinfo_notify(int event, const struct net_bridge *br,
		      const struct net_bridge_port *port)
{
	u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED;

	br_info_notify(event, br, port, filter);
}

/*
 * Dump information about all ports, in response to GETLINK
 */
int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
	       struct net_device *dev, u32 filter_mask, int nlflags)
{
	struct net_bridge_port *port = br_port_get_rtnl(dev);

	if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) &&
	    !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) &&
	    !(filter_mask & RTEXT_FILTER_MRP) &&
	    !(filter_mask & RTEXT_FILTER_CFM_CONFIG) &&
	    !(filter_mask & RTEXT_FILTER_CFM_STATUS))
		return 0;

	return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags,
			      filter_mask, dev, true);
}

static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
			int cmd, struct bridge_vlan_info *vinfo, bool *changed,
			struct netlink_ext_ack *extack)
{
	bool curr_change;
	int err = 0;

	switch (cmd) {
	case RTM_SETLINK:
		if (p) {
			/* if the MASTER flag is set this will act on the global
			 * per-VLAN entry as well
			 */
			err = nbp_vlan_add(p, vinfo->vid, vinfo->flags,
					   &curr_change, extack);
		} else {
			vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY;
			err = br_vlan_add(br, vinfo->vid, vinfo->flags,
					  &curr_change, extack);
		}
		if (curr_change)
			*changed = true;
		break;

	case RTM_DELLINK:
		if (p) {
			if (!nbp_vlan_delete(p, vinfo->vid))
				*changed = true;

			if ((vinfo->flags & BRIDGE_VLAN_INFO_MASTER) &&
			    !br_vlan_delete(p->br, vinfo->vid))
				*changed = true;
		} else if (!br_vlan_delete(br, vinfo->vid)) {
			*changed = true;
		}
		break;
	}

	return err;
}

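/*
 * Editor's note (illustrative): the RTM_SETLINK/RTM_DELLINK handling above
 * is what iproute2's "bridge vlan" front end typically drives; e.g.
 * "bridge vlan add dev eth0 vid 100 pvid untagged" would arrive as an
 * IFLA_BRIDGE_VLAN_INFO entry with the PVID and UNTAGGED flags set
 * (the device name is only an example).
 */
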
int br_process_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
			 int cmd, struct bridge_vlan_info *vinfo_curr,
			 struct bridge_vlan_info **vinfo_last, bool *changed,
			 struct netlink_ext_ack *extack)
{
	int err, rtm_cmd;

	if (!br_vlan_valid_id(vinfo_curr->vid, extack))
		return -EINVAL;

	/* needed for vlan-only NEWVLAN/DELVLAN notifications */
	rtm_cmd = br_afspec_cmd_to_rtm(cmd);

	if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
		if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack))
			return -EINVAL;
		*vinfo_last = vinfo_curr;
		return 0;
	}

	if (*vinfo_last) {
		struct bridge_vlan_info tmp_vinfo;
		int v, v_change_start = 0;

		if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack))
			return -EINVAL;

		memcpy(&tmp_vinfo, *vinfo_last,
		       sizeof(struct bridge_vlan_info));
		for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) {
			bool curr_change = false;

			tmp_vinfo.vid = v;
			err = br_vlan_info(br, p, cmd, &tmp_vinfo, &curr_change,
					   extack);
			if (err)
				break;
			if (curr_change) {
				*changed = curr_change;
				if (!v_change_start)
					v_change_start = v;
			} else {
				/* nothing to notify yet */
				if (!v_change_start)
					continue;
				br_vlan_notify(br, p, v_change_start,
					       v - 1, rtm_cmd);
				v_change_start = 0;
			}
			cond_resched();
		}
		/* v_change_start is set only if the last/whole range changed */
		if (v_change_start)
			br_vlan_notify(br, p, v_change_start,
				       v - 1, rtm_cmd);

		*vinfo_last = NULL;

		return err;
	}

	err = br_vlan_info(br, p, cmd, vinfo_curr, changed, extack);
	if (*changed)
		br_vlan_notify(br, p, vinfo_curr->vid, 0, rtm_cmd);

	return err;
}

static int br_afspec(struct net_bridge *br,
		     struct net_bridge_port *p,
		     struct nlattr *af_spec,
		     int cmd, bool *changed,
		     struct netlink_ext_ack *extack)
{
	struct bridge_vlan_info *vinfo_curr = NULL;
	struct bridge_vlan_info *vinfo_last = NULL;
	struct nlattr *attr;
	struct vtunnel_info tinfo_last = {};
	struct vtunnel_info tinfo_curr = {};
	int err = 0, rem;

	nla_for_each_nested(attr, af_spec, rem) {
		err = 0;
		switch (nla_type(attr)) {
		case IFLA_BRIDGE_VLAN_TUNNEL_INFO:
			if (!p || !(p->flags & BR_VLAN_TUNNEL))
				return -EINVAL;
			err = br_parse_vlan_tunnel_info(attr, &tinfo_curr);
			if (err)
				return err;
			err = br_process_vlan_tunnel_info(br, p, cmd,
							  &tinfo_curr,
							  &tinfo_last,
							  changed);
			if (err)
				return err;
			break;
		case IFLA_BRIDGE_VLAN_INFO:
			if (nla_len(attr) != sizeof(struct bridge_vlan_info))
				return -EINVAL;
			vinfo_curr = nla_data(attr);
			err = br_process_vlan_info(br, p, cmd, vinfo_curr,
						   &vinfo_last, changed,
						   extack);
			if (err)
				return err;
			break;
		case IFLA_BRIDGE_MRP:
			err = br_mrp_parse(br, p, attr, cmd, extack);
			if (err)
				return err;
			break;
		case IFLA_BRIDGE_CFM:
			err = br_cfm_parse(br, p, attr, cmd, extack);
			if (err)
				return err;
			break;
		case IFLA_BRIDGE_MST:
			if (!p) {
				NL_SET_ERR_MSG(extack,
					       "MST states can only be set on bridge ports");
				return -EINVAL;
			}

			if (cmd != RTM_SETLINK) {
				NL_SET_ERR_MSG(extack,
					       "MST states can only be set through RTM_SETLINK");
				return -EINVAL;
			}

			err = br_mst_process(p, attr, extack);
			if (err)
				return err;
			break;
		}
	}

	return err;
}

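/*
 * Editor's note (illustrative): in the policy below, strict_start_type
 * means attributes newer than IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT get strict
 * netlink validation, while NLA_REJECT marks read-only attributes (e.g.
 * IFLA_BRPORT_MCAST_N_GROUPS) that userspace may dump but never set.
 */
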
static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
	[IFLA_BRPORT_UNSPEC] = {
		.strict_start_type = IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + 1 },
	[IFLA_BRPORT_STATE] = { .type = NLA_U8 },
	[IFLA_BRPORT_COST] = { .type = NLA_U32 },
	[IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 },
	[IFLA_BRPORT_MODE] = { .type = NLA_U8 },
	[IFLA_BRPORT_GUARD] = { .type = NLA_U8 },
	[IFLA_BRPORT_PROTECT] = { .type = NLA_U8 },
	[IFLA_BRPORT_FAST_LEAVE] = { .type = NLA_U8 },
	[IFLA_BRPORT_LEARNING] = { .type = NLA_U8 },
	[IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 },
	[IFLA_BRPORT_PROXYARP] = { .type = NLA_U8 },
	[IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 },
	[IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 },
	[IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 },
	[IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 },
	[IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 },
	[IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 },
	[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
	[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
	[IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
	[IFLA_BRPORT_LOCKED] = { .type = NLA_U8 },
	[IFLA_BRPORT_MAB] = { .type = NLA_U8 },
	[IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
	[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 },
	[IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT },
	[IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
	[IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1),
	[IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 },
};

/* Change the state of the port and notify spanning tree */
static int br_set_port_state(struct net_bridge_port *p, u8 state)
{
	if (state > BR_STATE_BLOCKING)
		return -EINVAL;

	/* if kernel STP is running, don't allow changes */
	if (p->br->stp_enabled == BR_KERNEL_STP)
		return -EBUSY;

	/* if device is not up, change is not allowed
	 * if link is not present, only allowable state is disabled
	 */
	if (!netif_running(p->dev) ||
	    (!netif_oper_up(p->dev) && state != BR_STATE_DISABLED))
		return -ENETDOWN;

	br_set_state(p, state);
	br_port_state_selection(p->br);
	return 0;
}

/* Set/clear port flags based on attribute */
static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
			     int attrtype, unsigned long mask)
{
	if (!tb[attrtype])
		return;

	if (nla_get_u8(tb[attrtype]))
		p->flags |= mask;
	else
		p->flags &= ~mask;
}

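/*
 * Editor's note (illustrative): br_setport() below applies all flag
 * attributes through br_set_port_flag() first and only then validates the
 * combined result and offers it to switchdev, so an invalid combination
 * (or a switchdev veto) can restore the saved old_flags in one step.
 */
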
/* Process bridge protocol info on port */
static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
		      struct netlink_ext_ack *extack)
{
	unsigned long old_flags, changed_mask;
	bool br_vlan_tunnel_old;
	int err;

	old_flags = p->flags;
	br_vlan_tunnel_old = (old_flags & BR_VLAN_TUNNEL) ? true : false;

	br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
	br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
	br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE,
			 BR_MULTICAST_FAST_LEAVE);
	br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
	br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
	br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
	br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
	br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST,
			 BR_MULTICAST_TO_UNICAST);
	br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD);
	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
	br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL);
	br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS);
	br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
	br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED);
	br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB);
	br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
			 BR_NEIGH_VLAN_SUPPRESS);

	if ((p->flags & BR_PORT_MAB) &&
	    (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) {
		NL_SET_ERR_MSG(extack, "Bridge port must be locked and have learning enabled when MAB is enabled");
		p->flags = old_flags;
		return -EINVAL;
	} else if (!(p->flags & BR_PORT_MAB) && (old_flags & BR_PORT_MAB)) {
		struct net_bridge_fdb_flush_desc desc = {
			.flags = BIT(BR_FDB_LOCKED),
			.flags_mask = BIT(BR_FDB_LOCKED),
			.port_ifindex = p->dev->ifindex,
		};

		br_fdb_flush(p->br, &desc);
	}

	changed_mask = old_flags ^ p->flags;

	err = br_switchdev_set_port_flag(p, p->flags, changed_mask, extack);
	if (err) {
		p->flags = old_flags;
		return err;
	}

	if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL))
		nbp_vlan_tunnel_info_flush(p);

	br_port_flags_change(p, changed_mask);

	if (tb[IFLA_BRPORT_COST]) {
		err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
		if (err)
			return err;
	}

	if (tb[IFLA_BRPORT_PRIORITY]) {
		err = br_stp_set_port_priority(p, nla_get_u16(tb[IFLA_BRPORT_PRIORITY]));
		if (err)
			return err;
	}

	if (tb[IFLA_BRPORT_STATE]) {
		err = br_set_port_state(p, nla_get_u8(tb[IFLA_BRPORT_STATE]));
		if (err)
			return err;
	}

	if (tb[IFLA_BRPORT_FLUSH])
		br_fdb_delete_by_port(p->br, p, 0, 0);

#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
	if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) {
		u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]);

		err = br_multicast_set_port_router(&p->multicast_ctx,
						   mcast_router);
		if (err)
			return err;
	}

	if (tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]) {
		u32 hlimit;

		hlimit = nla_get_u32(tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]);
		err = br_multicast_eht_set_hosts_limit(p, hlimit);
		if (err)
			return err;
	}

	if (tb[IFLA_BRPORT_MCAST_MAX_GROUPS]) {
		u32 max_groups;

		max_groups = nla_get_u32(tb[IFLA_BRPORT_MCAST_MAX_GROUPS]);
		br_multicast_ngroups_set_max(&p->multicast_ctx, max_groups);
	}
#endif

	if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) {
		u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]);

		if (fwd_mask & BR_GROUPFWD_MACPAUSE)
			return -EINVAL;
		p->group_fwd_mask = fwd_mask;
	}

	if (tb[IFLA_BRPORT_BACKUP_PORT]) {
		struct net_device *backup_dev = NULL;
		u32 backup_ifindex;

		backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]);
		if (backup_ifindex) {
			backup_dev = __dev_get_by_index(dev_net(p->dev),
							backup_ifindex);
			if (!backup_dev)
				return -ENOENT;
		}

		err = nbp_backup_change(p, backup_dev);
		if (err)
			return err;
	}

	if (tb[IFLA_BRPORT_BACKUP_NHID]) {
		u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]);

		WRITE_ONCE(p->backup_nhid, backup_nhid);
	}

	return 0;
}

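/*
 * Editor's note (illustrative): both br_setlink() below and
 * br_port_slave_changelink() further down call br_setport() while holding
 * the br->lock spinlock with BH disabled, so nothing br_setport() does may
 * sleep.
 */
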
/* Change state and parameters on port. */
int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags,
	       struct netlink_ext_ack *extack)
{
	struct net_bridge *br = (struct net_bridge *)netdev_priv(dev);
	struct nlattr *tb[IFLA_BRPORT_MAX + 1];
	struct net_bridge_port *p;
	struct nlattr *protinfo;
	struct nlattr *afspec;
	bool changed = false;
	int err = 0;

	protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO);
	afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
	if (!protinfo && !afspec)
		return 0;

	p = br_port_get_rtnl(dev);
	/* We want to accept dev as bridge itself if the AF_SPEC
	 * is set to see if someone is setting vlan info on the bridge
	 */
	if (!p && !afspec)
		return -EINVAL;

	if (p && protinfo) {
		if (protinfo->nla_type & NLA_F_NESTED) {
			err = nla_parse_nested_deprecated(tb, IFLA_BRPORT_MAX,
							  protinfo,
							  br_port_policy,
							  NULL);
			if (err)
				return err;

			spin_lock_bh(&p->br->lock);
			err = br_setport(p, tb, extack);
			spin_unlock_bh(&p->br->lock);
		} else {
			/* Binary compatibility with old RSTP */
			if (nla_len(protinfo) < sizeof(u8))
				return -EINVAL;

			spin_lock_bh(&p->br->lock);
			err = br_set_port_state(p, nla_get_u8(protinfo));
			spin_unlock_bh(&p->br->lock);
		}
		if (err)
			goto out;
		changed = true;
	}

	if (afspec)
		err = br_afspec(br, p, afspec, RTM_SETLINK, &changed, extack);

	if (changed)
		br_ifinfo_notify(RTM_NEWLINK, br, p);
out:
	return err;
}

/* Delete port information */
int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
{
	struct net_bridge *br = (struct net_bridge *)netdev_priv(dev);
	struct net_bridge_port *p;
	struct nlattr *afspec;
	bool changed = false;
	int err = 0;

	afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
	if (!afspec)
		return 0;

	p = br_port_get_rtnl(dev);
	/* We want to accept dev as bridge itself as well */
	if (!p && !netif_is_bridge_master(dev))
		return -EINVAL;

	err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL);
	if (changed)
		/* Send RTM_NEWLINK because userspace
		 * expects RTM_NEWLINK for vlan dels
		 */
		br_ifinfo_notify(RTM_NEWLINK, br, p);

	return err;
}

static int br_validate(struct nlattr *tb[], struct nlattr *data[],
		       struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		return 0;

#ifdef CONFIG_BRIDGE_VLAN_FILTERING
	if (data[IFLA_BR_VLAN_PROTOCOL] &&
	    !eth_type_vlan(nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL])))
		return -EPROTONOSUPPORT;

	if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
		__u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);

		if (defpvid >= VLAN_VID_MASK)
			return -EINVAL;
	}
#endif

	return 0;
}

static int br_port_slave_changelink(struct net_device *brdev,
				    struct net_device *dev,
				    struct nlattr *tb[],
				    struct nlattr *data[],
				    struct netlink_ext_ack *extack)
{
	struct net_bridge *br = netdev_priv(brdev);
	int ret;

	if (!data)
		return 0;

	spin_lock_bh(&br->lock);
	ret = br_setport(br_port_get_rtnl(dev), data, extack);
	spin_unlock_bh(&br->lock);

	return ret;
}

static int br_port_fill_slave_info(struct sk_buff *skb,
				   const struct net_device *brdev,
				   const struct net_device *dev)
{
	return br_port_fill_attrs(skb, br_port_get_rtnl(dev));
}

static size_t br_port_get_slave_size(const struct net_device *brdev,
				     const struct net_device *dev)
{
	return br_port_info_size();
}

static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
	[IFLA_BR_UNSPEC] = {
		.strict_start_type = IFLA_BR_FDB_N_LEARNED },
	[IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 },
	[IFLA_BR_HELLO_TIME] = { .type = NLA_U32 },
	[IFLA_BR_MAX_AGE] = { .type = NLA_U32 },
	[IFLA_BR_AGEING_TIME] = { .type = NLA_U32 },
	[IFLA_BR_STP_STATE] = { .type = NLA_U32 },
	[IFLA_BR_PRIORITY] = { .type = NLA_U16 },
	[IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 },
	[IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 },
	[IFLA_BR_GROUP_FWD_MASK] = { .type = NLA_U16 },
	[IFLA_BR_GROUP_ADDR] = { .type = NLA_BINARY,
				 .len  = ETH_ALEN },
	[IFLA_BR_MCAST_ROUTER] = { .type = NLA_U8 },
	[IFLA_BR_MCAST_SNOOPING] = { .type = NLA_U8 },
	[IFLA_BR_MCAST_QUERY_USE_IFADDR] = { .type = NLA_U8 },
	[IFLA_BR_MCAST_QUERIER] = { .type = NLA_U8 },
	[IFLA_BR_MCAST_HASH_ELASTICITY] = { .type = NLA_U32 },
	[IFLA_BR_MCAST_HASH_MAX] = { .type = NLA_U32 },
	[IFLA_BR_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 },
	[IFLA_BR_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 },
	[IFLA_BR_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 },
	[IFLA_BR_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 },
	[IFLA_BR_MCAST_QUERIER_INTVL] = { .type = NLA_U64 },
	[IFLA_BR_MCAST_QUERY_INTVL] = { .type = NLA_U64 },
	[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 },
	[IFLA_BR_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 },
	[IFLA_BR_NF_CALL_IPTABLES] = { .type = NLA_U8 },
	[IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 },
	[IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 },
	[IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 },
	[IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 },
	[IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
	[IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
	[IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
	[IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
	[IFLA_BR_MULTI_BOOLOPT] =
		NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)),
	[IFLA_BR_FDB_N_LEARNED] = { .type = NLA_REJECT },
	[IFLA_BR_FDB_MAX_LEARNED] = { .type = NLA_U32 },
};

static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
			 struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	struct net_bridge *br = netdev_priv(brdev);
	int err;

	if (!data)
		return 0;

	if (data[IFLA_BR_FORWARD_DELAY]) {
		err = br_set_forward_delay(br, nla_get_u32(data[IFLA_BR_FORWARD_DELAY]));
		if (err)
			return err;
	}

	if (data[IFLA_BR_HELLO_TIME]) {
		err = br_set_hello_time(br, nla_get_u32(data[IFLA_BR_HELLO_TIME]));
		if (err)
			return err;
	}

	if (data[IFLA_BR_MAX_AGE]) {
		err = br_set_max_age(br, nla_get_u32(data[IFLA_BR_MAX_AGE]));
		if (err)
			return err;
	}

	if (data[IFLA_BR_AGEING_TIME]) {
		err = br_set_ageing_time(br, nla_get_u32(data[IFLA_BR_AGEING_TIME]));
		if (err)
			return err;
	}

	if (data[IFLA_BR_STP_STATE]) {
		u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]);

		err = br_stp_set_enabled(br, stp_enabled, extack);
		if (err)
			return err;
	}

	if (data[IFLA_BR_PRIORITY]) {
		u32 priority = nla_get_u16(data[IFLA_BR_PRIORITY]);

		br_stp_set_bridge_priority(br, priority);
	}

	if (data[IFLA_BR_VLAN_FILTERING]) {
		u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]);

		err = br_vlan_filter_toggle(br, vlan_filter, extack);
		if (err)
			return err;
	}

#ifdef CONFIG_BRIDGE_VLAN_FILTERING
	if (data[IFLA_BR_VLAN_PROTOCOL]) {
		__be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]);

		err = __br_vlan_set_proto(br, vlan_proto, extack);
		if (err)
			return err;
	}

	if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
		__u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);

		err = __br_vlan_set_default_pvid(br, defpvid, extack);
		if (err)
			return err;
	}

	if (data[IFLA_BR_VLAN_STATS_ENABLED]) {
		__u8 vlan_stats = nla_get_u8(data[IFLA_BR_VLAN_STATS_ENABLED]);

		err = br_vlan_set_stats(br, vlan_stats);
		if (err)
			return err;
	}

	if (data[IFLA_BR_VLAN_STATS_PER_PORT]) {
		__u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]);

		err = br_vlan_set_stats_per_port(br, per_port);
		if (err)
			return err;
	}
#endif

	if (data[IFLA_BR_GROUP_FWD_MASK]) {
		u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]);

		if (fwd_mask & BR_GROUPFWD_RESTRICTED)
			return -EINVAL;
		br->group_fwd_mask = fwd_mask;
	}

	if (data[IFLA_BR_GROUP_ADDR]) {
		u8 new_addr[ETH_ALEN];

		if (nla_len(data[IFLA_BR_GROUP_ADDR]) != ETH_ALEN)
			return -EINVAL;
		memcpy(new_addr, nla_data(data[IFLA_BR_GROUP_ADDR]), ETH_ALEN);
		if (!is_link_local_ether_addr(new_addr))
			return -EINVAL;
		if (new_addr[5] == 1 ||		/* 802.3x Pause address */
		    new_addr[5] == 2 ||		/* 802.3ad Slow protocols */
		    new_addr[5] == 3)		/* 802.1X PAE address */
			return -EINVAL;
		spin_lock_bh(&br->lock);
		memcpy(br->group_addr, new_addr, sizeof(br->group_addr));
		spin_unlock_bh(&br->lock);
		br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
		br_recalculate_fwd_mask(br);
	}

	if (data[IFLA_BR_FDB_FLUSH]) {
		struct net_bridge_fdb_flush_desc desc = {
			.flags_mask = BIT(BR_FDB_STATIC)
		};

		br_fdb_flush(br, &desc);
	}

#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
	if (data[IFLA_BR_MCAST_ROUTER]) {
		u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]);

		err = br_multicast_set_router(&br->multicast_ctx,
					      multicast_router);
		if (err)
			return err;
	}

	if (data[IFLA_BR_MCAST_SNOOPING]) {
		u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]);

		err = br_multicast_toggle(br, mcast_snooping, extack);
		if (err)
			return err;
	}

	if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) {
		u8 val;

		val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]);
		br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
	}

	if (data[IFLA_BR_MCAST_QUERIER]) {
		u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]);

		err = br_multicast_set_querier(&br->multicast_ctx,
					       mcast_querier);
		if (err)
			return err;
	}

	if (data[IFLA_BR_MCAST_HASH_ELASTICITY])
		br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n",
			RHT_ELASTICITY);

	if (data[IFLA_BR_MCAST_HASH_MAX])
		br->hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]);

	if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) {
		u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]);

		br->multicast_ctx.multicast_last_member_count = val;
	}

	if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) {
		u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]);

		br->multicast_ctx.multicast_startup_query_count = val;
	}

	if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) {
		u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]);

		br->multicast_ctx.multicast_last_member_interval =
			clock_t_to_jiffies(val);
	}

	if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) {
		u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]);

		br->multicast_ctx.multicast_membership_interval =
			clock_t_to_jiffies(val);
	}

	if (data[IFLA_BR_MCAST_QUERIER_INTVL]) {
		u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]);

		br->multicast_ctx.multicast_querier_interval =
			clock_t_to_jiffies(val);
	}

	if (data[IFLA_BR_MCAST_QUERY_INTVL]) {
		u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]);

		br_multicast_set_query_intvl(&br->multicast_ctx, val);
	}

	if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) {
		u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]);

		br->multicast_ctx.multicast_query_response_interval =
			clock_t_to_jiffies(val);
	}

	if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) {
		u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]);

		br_multicast_set_startup_query_intvl(&br->multicast_ctx, val);
	}

	if (data[IFLA_BR_MCAST_STATS_ENABLED]) {
		__u8 mcast_stats;

		mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
		br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!mcast_stats);
	}

	if (data[IFLA_BR_MCAST_IGMP_VERSION]) {
		__u8 igmp_version;

		igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]);
		err = br_multicast_set_igmp_version(&br->multicast_ctx,
						    igmp_version);
		if (err)
			return err;
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (data[IFLA_BR_MCAST_MLD_VERSION]) {
		__u8 mld_version;

		mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]);
		err = br_multicast_set_mld_version(&br->multicast_ctx,
						   mld_version);
		if (err)
			return err;
	}
#endif
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	if (data[IFLA_BR_NF_CALL_IPTABLES]) {
		u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]);

		br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
	}

	if (data[IFLA_BR_NF_CALL_IP6TABLES]) {
		u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]);

		br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
	}

	if (data[IFLA_BR_NF_CALL_ARPTABLES]) {
		u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]);

		br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
	}
#endif

	if (data[IFLA_BR_MULTI_BOOLOPT]) {
		struct br_boolopt_multi *bm;

		bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]);
		err = br_boolopt_multi_toggle(br, bm, extack);
		if (err)
			return err;
	}

	if (data[IFLA_BR_FDB_MAX_LEARNED]) {
		u32 val = nla_get_u32(data[IFLA_BR_FDB_MAX_LEARNED]);

		WRITE_ONCE(br->fdb_max_learned, val);
	}

	return 0;
}

static int br_dev_newlink(struct net *src_net, struct net_device *dev,
			  struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
{
	struct net_bridge *br = netdev_priv(dev);
	int err;

	err = register_netdevice(dev);
	if (err)
		return err;

	if (tb[IFLA_ADDRESS]) {
		spin_lock_bh(&br->lock);
		br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
		spin_unlock_bh(&br->lock);
	}

	err = br_changelink(dev, tb, data, extack);
	if (err)
		br_dev_delete(dev, NULL);

	return err;
}

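/*
 * Editor's note (illustrative): br_dev_newlink() above registers the netdev
 * first and then applies the IFLA_BR_* attributes via br_changelink(); on
 * failure it unwinds with br_dev_delete(), so something like
 * "ip link add br0 type bridge forward_delay 1500" either fully succeeds
 * or leaves no device behind (command shown only as an example).
 */
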
static size_t br_get_size(const struct net_device *brdev)
{
	return nla_total_size(sizeof(u32)) +	/* IFLA_BR_FORWARD_DELAY */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_HELLO_TIME */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_MAX_AGE */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_AGEING_TIME */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_STP_STATE */
	       nla_total_size(sizeof(u16)) +	/* IFLA_BR_PRIORITY */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_VLAN_FILTERING */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
	       nla_total_size(sizeof(__be16)) +	/* IFLA_BR_VLAN_PROTOCOL */
	       nla_total_size(sizeof(u16)) +	/* IFLA_BR_VLAN_DEFAULT_PVID */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_VLAN_STATS_ENABLED */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_VLAN_STATS_PER_PORT */
#endif
	       nla_total_size(sizeof(u16)) +	/* IFLA_BR_GROUP_FWD_MASK */
	       nla_total_size(sizeof(struct ifla_bridge_id)) +	/* IFLA_BR_ROOT_ID */
	       nla_total_size(sizeof(struct ifla_bridge_id)) +	/* IFLA_BR_BRIDGE_ID */
	       nla_total_size(sizeof(u16)) +	/* IFLA_BR_ROOT_PORT */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_ROOT_PATH_COST */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_TOPOLOGY_CHANGE */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */
	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */
	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */
	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */
	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_GC_TIMER */
	       nla_total_size(ETH_ALEN) +	/* IFLA_BR_GROUP_ADDR */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_FDB_N_LEARNED */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_FDB_MAX_LEARNED */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_MCAST_ROUTER */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_MCAST_SNOOPING */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_MCAST_QUERY_USE_IFADDR */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_MCAST_QUERIER */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_MCAST_STATS_ENABLED */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_MCAST_HASH_ELASTICITY */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_MCAST_HASH_MAX */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_MCAST_LAST_MEMBER_CNT */
	       nla_total_size(sizeof(u32)) +	/* IFLA_BR_MCAST_STARTUP_QUERY_CNT */
	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */
	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */
	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */
	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */
	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */
	       nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_MCAST_IGMP_VERSION */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_MCAST_MLD_VERSION */
	       br_multicast_querier_state_size() + /* IFLA_BR_MCAST_QUERIER_STATE */
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_NF_CALL_IPTABLES */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_NF_CALL_IP6TABLES */
	       nla_total_size(sizeof(u8)) +	/* IFLA_BR_NF_CALL_ARPTABLES */
#endif
	       nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */
	       0;
}

static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
{
	struct net_bridge *br = netdev_priv(brdev);
	u32 forward_delay = jiffies_to_clock_t(br->forward_delay);
	u32 hello_time = jiffies_to_clock_t(br->hello_time);
	u32 age_time = jiffies_to_clock_t(br->max_age);
	u32 ageing_time = jiffies_to_clock_t(br->ageing_time);
	u32 stp_enabled = br->stp_enabled;
	u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1];
	u8 vlan_enabled = br_vlan_enabled(br->dev);
	struct br_boolopt_multi bm;
	u64 clockval;

	clockval = br_timer_value(&br->hello_timer);
	if (nla_put_u64_64bit(skb, IFLA_BR_HELLO_TIMER, clockval, IFLA_BR_PAD))
		return -EMSGSIZE;
	clockval = br_timer_value(&br->tcn_timer);
	if (nla_put_u64_64bit(skb, IFLA_BR_TCN_TIMER, clockval, IFLA_BR_PAD))
		return -EMSGSIZE;
	clockval = br_timer_value(&br->topology_change_timer);
	if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval,
			      IFLA_BR_PAD))
		return -EMSGSIZE;
	clockval = br_timer_value(&br->gc_work.timer);
	if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
		return -EMSGSIZE;

	br_boolopt_multi_get(br, &bm);
	if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) ||
	    nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) ||
	    nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) ||
	    nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) ||
	    nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) ||
	    nla_put_u16(skb, IFLA_BR_PRIORITY, priority) ||
	    nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled) ||
	    nla_put_u16(skb, IFLA_BR_GROUP_FWD_MASK, br->group_fwd_mask) ||
	    nla_put(skb, IFLA_BR_BRIDGE_ID, sizeof(struct ifla_bridge_id),
		    &br->bridge_id) ||
	    nla_put(skb, IFLA_BR_ROOT_ID, sizeof(struct ifla_bridge_id),
		    &br->designated_root) ||
	    nla_put_u16(skb, IFLA_BR_ROOT_PORT, br->root_port) ||
	    nla_put_u32(skb, IFLA_BR_ROOT_PATH_COST, br->root_path_cost) ||
	    nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) ||
	    nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED,
		       br->topology_change_detected) ||
	    nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) ||
	    nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm) ||
	    nla_put_u32(skb, IFLA_BR_FDB_N_LEARNED,
			atomic_read(&br->fdb_n_learned)) ||
	    nla_put_u32(skb, IFLA_BR_FDB_MAX_LEARNED, br->fdb_max_learned))
		return -EMSGSIZE;

#ifdef CONFIG_BRIDGE_VLAN_FILTERING
	if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) ||
	    nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) ||
	    nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED,
		       br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) ||
	    nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT,
		       br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)))
		return -EMSGSIZE;
#endif
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
	if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER,
		       br->multicast_ctx.multicast_router) ||
	    nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING,
		       br_opt_get(br, BROPT_MULTICAST_ENABLED)) ||
	    nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR,
		       br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) ||
	    nla_put_u8(skb, IFLA_BR_MCAST_QUERIER,
		       br->multicast_ctx.multicast_querier) ||
	    nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
		       br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) ||
	    nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) ||
	    nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
	    nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
			br->multicast_ctx.multicast_last_member_count) ||
	    nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT,
			br->multicast_ctx.multicast_startup_query_count) ||
	    nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION,
		       br->multicast_ctx.multicast_igmp_version) ||
	    br_multicast_dump_querier_state(skb, &br->multicast_ctx,
					    IFLA_BR_MCAST_QUERIER_STATE))
		return -EMSGSIZE;

#if IS_ENABLED(CONFIG_IPV6)
	if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION,
		       br->multicast_ctx.multicast_mld_version))
		return -EMSGSIZE;
#endif
	clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval);
	if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval,
			      IFLA_BR_PAD))
		return -EMSGSIZE;
	clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval);
	if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval,
			      IFLA_BR_PAD))
		return -EMSGSIZE;
	clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval);
	if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval,
			      IFLA_BR_PAD))
		return -EMSGSIZE;
	clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval);
	if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval,
			      IFLA_BR_PAD))
		return -EMSGSIZE;
	clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval);
	if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval,
			      IFLA_BR_PAD))
		return -EMSGSIZE;
	clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval);
	if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval,
			      IFLA_BR_PAD))
		return -EMSGSIZE;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES,
		       br_opt_get(br, BROPT_NF_CALL_IPTABLES) ? 1 : 0) ||
	    nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES,
		       br_opt_get(br, BROPT_NF_CALL_IP6TABLES) ? 1 : 0) ||
	    nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES,
		       br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0))
		return -EMSGSIZE;
#endif

	return 0;
}

static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
{
	struct net_bridge_port *p = NULL;
	struct net_bridge_vlan_group *vg;
	struct net_bridge_vlan *v;
	struct net_bridge *br;
	int numvls = 0;

	switch (attr) {
	case IFLA_STATS_LINK_XSTATS:
		br = netdev_priv(dev);
		vg = br_vlan_group(br);
		break;
	case IFLA_STATS_LINK_XSTATS_SLAVE:
		p = br_port_get_rtnl(dev);
		if (!p)
			return 0;
		vg = nbp_vlan_group(p);
		break;
	default:
		return 0;
	}

	if (vg) {
		/* we need to count all, even placeholder entries */
		list_for_each_entry(v, &vg->vlan_list, vlist)
			numvls++;
	}

	return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) +
	       nla_total_size_64bit(sizeof(struct br_mcast_stats)) +
	       (p ? nla_total_size_64bit(sizeof(p->stp_xstats)) : 0) +
	       nla_total_size(0);
}

static int br_fill_linkxstats(struct sk_buff *skb,
			      const struct net_device *dev,
			      int *prividx, int attr)
{
	struct nlattr *nla __maybe_unused;
	struct net_bridge_port *p = NULL;
	struct net_bridge_vlan_group *vg;
	struct net_bridge_vlan *v;
	struct net_bridge *br;
	struct nlattr *nest;
	int vl_idx = 0;

	switch (attr) {
	case IFLA_STATS_LINK_XSTATS:
		br = netdev_priv(dev);
		vg = br_vlan_group(br);
		break;
	case IFLA_STATS_LINK_XSTATS_SLAVE:
		p = br_port_get_rtnl(dev);
		if (!p)
			return 0;
		br = p->br;
		vg = nbp_vlan_group(p);
		break;
	default:
		return -EINVAL;
	}

	nest = nla_nest_start_noflag(skb, LINK_XSTATS_TYPE_BRIDGE);
	if (!nest)
		return -EMSGSIZE;

	if (vg) {
		u16 pvid;

		pvid = br_get_pvid(vg);
		list_for_each_entry(v, &vg->vlan_list, vlist) {
			struct bridge_vlan_xstats vxi;
			struct pcpu_sw_netstats stats;

			if (++vl_idx < *prividx)
				continue;
			memset(&vxi, 0, sizeof(vxi));
			vxi.vid = v->vid;
			vxi.flags = v->flags;
			if (v->vid == pvid)
				vxi.flags |= BRIDGE_VLAN_INFO_PVID;
			br_vlan_get_stats(v, &stats);
			vxi.rx_bytes = u64_stats_read(&stats.rx_bytes);
			vxi.rx_packets = u64_stats_read(&stats.rx_packets);
			vxi.tx_bytes = u64_stats_read(&stats.tx_bytes);
			vxi.tx_packets = u64_stats_read(&stats.tx_packets);

			if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi))
				goto nla_put_failure;
		}
	}

#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
	if (++vl_idx >= *prividx) {
		nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST,
					sizeof(struct br_mcast_stats),
					BRIDGE_XSTATS_PAD);
		if (!nla)
			goto nla_put_failure;
		br_multicast_get_stats(br, p, nla_data(nla));
	}
#endif

	if (p) {
		nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_STP,
					sizeof(p->stp_xstats),
					BRIDGE_XSTATS_PAD);
		if (!nla)
			goto nla_put_failure;

		spin_lock_bh(&br->lock);
		memcpy(nla_data(nla), &p->stp_xstats, sizeof(p->stp_xstats));
		spin_unlock_bh(&br->lock);
	}

	nla_nest_end(skb, nest);
	*prividx = 0;

	return 0;

nla_put_failure:
	nla_nest_end(skb, nest);
	*prividx = vl_idx;

	return -EMSGSIZE;
}

static struct rtnl_af_ops br_af_ops __read_mostly = {
	.family			= AF_BRIDGE,
	.get_link_af_size	= br_get_link_af_size_filtered,
};

struct rtnl_link_ops br_link_ops __read_mostly = {
	.kind			= "bridge",
	.priv_size		= sizeof(struct net_bridge),
	.setup			= br_dev_setup,
	.maxtype		= IFLA_BR_MAX,
	.policy			= br_policy,
	.validate		= br_validate,
	.newlink		= br_dev_newlink,
	.changelink		= br_changelink,
	.dellink		= br_dev_delete,
	.get_size		= br_get_size,
	.fill_info		= br_fill_info,
	.fill_linkxstats	= br_fill_linkxstats,
	.get_linkxstats_size	= br_get_linkxstats_size,
	.slave_maxtype		= IFLA_BRPORT_MAX,
	.slave_policy		= br_port_policy,
	.slave_changelink	= br_port_slave_changelink,
	.get_slave_size		= br_port_get_slave_size,
	.fill_slave_info	= br_port_fill_slave_info,
};

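/*
 * Editor's note (illustrative): br_netlink_init() below registers the
 * rtnetlink pieces in order (the per-VLAN rtnl handlers, the AF_BRIDGE
 * af_ops, then the "bridge" link ops) and unwinds them in reverse through
 * the out_af/out_vlan labels if any step fails.
 */
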
int __init br_netlink_init(void)
{
	int err;

	err = br_vlan_rtnl_init();
	if (err)
		goto out;

	err = rtnl_af_register(&br_af_ops);
	if (err)
		goto out_vlan;

	err = rtnl_link_register(&br_link_ops);
	if (err)
		goto out_af;

	return 0;

out_af:
	rtnl_af_unregister(&br_af_ops);
out_vlan:
	br_vlan_rtnl_uninit();
out:
	return err;
}

void br_netlink_fini(void)
{
	br_vlan_rtnl_uninit();
	rtnl_af_unregister(&br_af_ops);
	rtnl_link_unregister(&br_link_ops);
}
3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 
3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 
4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 
5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 
5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 
6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * mac80211 <-> driver interface
 *
 * Copyright 2002-2005, Devicescape Software, Inc.
 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
 * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014 Intel Mobile Communications GmbH
 * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
 * Copyright (C) 2018 - 2024 Intel Corporation
 */

#ifndef MAC80211_H
#define MAC80211_H

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/if_ether.h>
#include <linux/skbuff.h>
#include <linux/ieee80211.h>
#include <linux/lockdep.h>
#include <net/cfg80211.h>
#include <net/codel.h>
#include <net/ieee80211_radiotap.h>
#include <linux/unaligned.h>

/**
 * DOC: Introduction
 *
 * mac80211 is the Linux stack for 802.11 hardware that implements
 * only partial functionality in hard- or firmware. This document
 * defines the interface between mac80211 and low-level hardware
 * drivers.
 */

/**
 * DOC: Calling mac80211 from interrupts
 *
 * Only ieee80211_tx_status_irqsafe() and ieee80211_rx_irqsafe() can be
 * called in hardware interrupt context. The low-level driver must not call
 * any other functions in hardware interrupt context. If there is a need for
 * such a call, the low-level driver should first ACK the interrupt and
 * perform the IEEE 802.11 code call after this, e.g. from a scheduled
 * workqueue or even a tasklet function.
 *
 * NOTE: If the driver opts to use the _irqsafe() functions, it may not also
 * use the non-IRQ-safe functions!
 */
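/*
 * Editor's illustrative sketch (not part of the mac80211 API surface): a
 * minimal hard-IRQ handler following the rule above. It only ACKs the
 * device interrupt and hands received frames to mac80211 via the
 * _irqsafe() variant. The mydrv_*() helpers and the private struct are
 * hypothetical; ieee80211_rx_irqsafe() is the real call named above.
 */
#if 0	/* example only */
static irqreturn_t mydrv_interrupt(int irq, void *data)
{
	struct mydrv_priv *priv = data;		/* hypothetical driver state */
	struct sk_buff *skb;

	mydrv_ack_irq(priv);			/* hypothetical: ACK first */

	/* one of the only two mac80211 calls allowed in hard IRQ context */
	while ((skb = mydrv_pull_rx_frame(priv)) != NULL)
		ieee80211_rx_irqsafe(priv->hw, skb);

	return IRQ_HANDLED;
}
#endif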
/**
 * DOC: Warning
 *
 * If you're reading this document and not the header file itself, it will
 * be incomplete because not all documentation has been converted yet.
 */

/**
 * DOC: Frame format
 *
 * As a general rule, when frames are passed between mac80211 and the driver,
 * they start with the IEEE 802.11 header and include the same octets that are
 * sent over the air except for the FCS which should be calculated by the
 * hardware.
 *
 * There are, however, various exceptions to this rule for advanced features:
 *
 * The first exception is for hardware encryption and decryption offload
 * where the IV/ICV may or may not be generated in hardware.
 *
 * Secondly, when the hardware handles fragmentation, the frame handed to
 * the driver from mac80211 is the MSDU, not the MPDU.
 */

/**
 * DOC: mac80211 workqueue
 *
 * mac80211 provides its own workqueue for drivers and internal mac80211 use.
 * The workqueue is a single threaded workqueue and can only be accessed by
 * helpers for sanity checking. Drivers must ensure that all work added onto
 * the mac80211 workqueue is cancelled in the driver's stop() callback.
 *
 * mac80211 will flush the workqueue upon interface removal and during
 * suspend.
 *
 * All work performed on the mac80211 workqueue must not acquire the RTNL
 * lock.
 */

/**
 * DOC: mac80211 software tx queueing
 *
 * mac80211 uses an intermediate queueing implementation, designed to allow
 * the driver to keep hardware queues short and to provide some fairness
 * between different stations/interfaces.
 *
 * Drivers must provide the .wake_tx_queue driver operation by either
 * linking it to ieee80211_handle_wake_tx_queue() or implementing a custom
 * handler.
 *
 * Intermediate queues (struct ieee80211_txq) are kept per-sta per-tid, with
 * another per-sta for non-data/non-mgmt and bufferable management frames,
 * and a single per-vif queue for multicast data frames.
 *
 * The driver is expected to initialize its private per-queue data for
 * stations and interfaces in the .add_interface and .sta_add ops.
 *
 * The driver can't access the internal TX queues (iTXQs) directly.
 * Whenever mac80211 adds a new frame to a queue, it calls the .wake_tx_queue
 * driver op.
 * Drivers implementing a custom .wake_tx_queue op can get the queued frames
 * by calling ieee80211_tx_dequeue(). Drivers using
 * ieee80211_handle_wake_tx_queue() will simply get the individual frames
 * pushed via the .tx driver operation.
 *
 * Drivers can optionally delegate responsibility for scheduling queues to
 * mac80211, to take advantage of airtime fairness accounting. In this case,
 * to obtain the next queue to pull frames from, the driver calls
 * ieee80211_next_txq(). The driver is then expected to return the txq using
 * ieee80211_return_txq().
 *
 * For AP powersave TIM handling, the driver only needs to indicate if it has
 * buffered packets in the driver specific data structures by calling
 * ieee80211_sta_set_buffered(). For frames buffered in the ieee80211_txq
 * struct, mac80211 sets the appropriate TIM PVB bits and calls
 * .release_buffered_frames().
 * In that callback the driver is therefore expected to release its own
 * buffered frames and afterwards also frames from the ieee80211_txq
 * (obtained via the usual ieee80211_tx_dequeue).
 */
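/*
 * Editor's illustrative sketch: a custom .wake_tx_queue handler using the
 * mac80211 airtime scheduler described above. The ieee80211_*() calls are
 * the real API named in this document; the mydrv_*() helpers are
 * hypothetical placeholders for hardware-queue access.
 */
#if 0	/* example only */
static void mydrv_wake_tx_queue(struct ieee80211_hw *hw,
				struct ieee80211_txq *txq)
{
	u8 ac = txq->ac;
	struct sk_buff *skb;

	ieee80211_txq_schedule_start(hw, ac);
	while ((txq = ieee80211_next_txq(hw, ac)) != NULL) {
		/* pull frames while the hardware queue has room */
		while (mydrv_hwq_has_room(hw, ac) &&
		       (skb = ieee80211_tx_dequeue(hw, txq)) != NULL)
			mydrv_push_frame(hw, ac, skb);
		/* return the txq so fairness accounting stays correct */
		ieee80211_return_txq(hw, txq, false);
	}
	ieee80211_txq_schedule_end(hw, ac);
}
#endif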
/**
 * DOC: HW timestamping
 *
 * Timing Measurement and Fine Timing Measurement require accurate timestamps
 * of the action frames TX/RX and their respective acks.
 *
 * To report hardware timestamps for Timing Measurement or Fine Timing
 * Measurement frame RX, the low level driver should set the SKB's hwtstamp
 * field to the frame RX timestamp and report the ack TX timestamp in the
 * ieee80211_rx_status struct.
 *
 * Similarly, to report hardware timestamps for Timing Measurement or Fine
 * Timing Measurement frame TX, the driver should set the SKB's hwtstamp
 * field to the frame TX timestamp and report the ack RX timestamp in the
 * ieee80211_tx_status struct.
 */

struct device;

/**
 * enum ieee80211_max_queues - maximum number of queues
 *
 * @IEEE80211_MAX_QUEUES: Maximum number of regular device queues.
 * @IEEE80211_MAX_QUEUE_MAP: bitmap with maximum queues set
 */
enum ieee80211_max_queues {
        IEEE80211_MAX_QUEUES = 16,
        IEEE80211_MAX_QUEUE_MAP = BIT(IEEE80211_MAX_QUEUES) - 1,
};

#define IEEE80211_INVAL_HW_QUEUE 0xff

/**
 * enum ieee80211_ac_numbers - AC numbers as used in mac80211
 * @IEEE80211_AC_VO: voice
 * @IEEE80211_AC_VI: video
 * @IEEE80211_AC_BE: best effort
 * @IEEE80211_AC_BK: background
 */
enum ieee80211_ac_numbers {
        IEEE80211_AC_VO = 0,
        IEEE80211_AC_VI = 1,
        IEEE80211_AC_BE = 2,
        IEEE80211_AC_BK = 3,
};

/**
 * struct ieee80211_tx_queue_params - transmit queue configuration
 *
 * The information provided in this structure is required for QoS
 * transmit queue configuration. Cf. IEEE 802.11 7.3.2.29.
 *
 * @aifs: arbitration interframe space [0..255]
 * @cw_min: minimum contention window [a value of the form 2^n-1 in the range 1..32767]
 * @cw_max: maximum contention window [like @cw_min]
 * @txop: maximum burst time in units of 32 usecs, 0 meaning disabled
 * @acm: is mandatory admission control required for the access category
 * @uapsd: is U-APSD mode enabled for the queue
 * @mu_edca: is the MU EDCA configured
 * @mu_edca_param_rec: MU EDCA Parameter Record for HE
 */
struct ieee80211_tx_queue_params {
        u16 txop;
        u16 cw_min;
        u16 cw_max;
        u8 aifs;
        bool acm;
        bool uapsd;
        bool mu_edca;
        struct ieee80211_he_mu_edca_param_ac_rec mu_edca_param_rec;
};

struct ieee80211_low_level_stats {
        unsigned int dot11ACKFailureCount;
        unsigned int dot11RTSFailureCount;
        unsigned int dot11FCSErrorCount;
        unsigned int dot11RTSSuccessCount;
};
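/*
 * Editor's illustrative sketch: deriving the "2^n - 1" @cw_min/@cw_max
 * values of struct ieee80211_tx_queue_params above from the ECW exponents
 * carried in WMM/EDCA parameter records. mydrv_fill_edca() is a
 * hypothetical helper, shown only to make the required value form concrete.
 */
#if 0	/* example only */
static void mydrv_fill_edca(struct ieee80211_tx_queue_params *p,
			    u8 ecw_min, u8 ecw_max, u8 aifs, u16 txop)
{
	p->cw_min = (1 << ecw_min) - 1;	/* e.g. ECWmin 4  -> cw_min 15 */
	p->cw_max = (1 << ecw_max) - 1;	/* e.g. ECWmax 10 -> cw_max 1023 */
	p->aifs = aifs;			/* 0..255 */
	p->txop = txop;			/* units of 32 usec, 0 = disabled */
}
#endif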
/**
 * enum ieee80211_chanctx_change - change flag for channel context
 * @IEEE80211_CHANCTX_CHANGE_WIDTH: The channel width changed
 * @IEEE80211_CHANCTX_CHANGE_RX_CHAINS: The number of RX chains changed
 * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed
 * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel, this is used only with channel switching with CSA
 * @IEEE80211_CHANCTX_CHANGE_MIN_DEF: The min chandef changed
 * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider bandwidth) OFDMA settings need to be changed
 * @IEEE80211_CHANCTX_CHANGE_PUNCTURING: The punctured channel(s) bitmap was changed.
 */
enum ieee80211_chanctx_change {
        IEEE80211_CHANCTX_CHANGE_WIDTH = BIT(0),
        IEEE80211_CHANCTX_CHANGE_RX_CHAINS = BIT(1),
        IEEE80211_CHANCTX_CHANGE_RADAR = BIT(2),
        IEEE80211_CHANCTX_CHANGE_CHANNEL = BIT(3),
        IEEE80211_CHANCTX_CHANGE_MIN_DEF = BIT(4),
        IEEE80211_CHANCTX_CHANGE_AP = BIT(5),
        IEEE80211_CHANCTX_CHANGE_PUNCTURING = BIT(6),
};

/**
 * struct ieee80211_chan_req - A channel "request"
 * @oper: channel definition to use for operation
 * @ap: the channel definition of the AP, if any (otherwise the chan member is %NULL)
 */
struct ieee80211_chan_req {
        struct cfg80211_chan_def oper;
        struct cfg80211_chan_def ap;
};

/**
 * struct ieee80211_chanctx_conf - channel context that vifs may be tuned to
 *
 * This is the driver-visible part. The ieee80211_chanctx
 * that contains it is visible in mac80211 only.
 *
 * @def: the channel definition
 * @min_def: the minimum channel definition currently required.
 * @ap: the channel definition the AP actually is operating as, for use with (wider bandwidth) OFDMA
 * @radio_idx: index of the wiphy radio used for this channel
 * @rx_chains_static: The number of RX chains that must always be active on the channel to receive MIMO transmissions
 * @rx_chains_dynamic: The number of RX chains that must be enabled after RTS/CTS handshake to receive SMPS MIMO transmissions; this will always be >= @rx_chains_static.
 * @radar_enabled: whether radar detection is enabled on this channel.
 * @drv_priv: data area for driver use, will always be aligned to sizeof(void *), size is determined in hw information.
 */
struct ieee80211_chanctx_conf {
        struct cfg80211_chan_def def;
        struct cfg80211_chan_def min_def;
        struct cfg80211_chan_def ap;

        int radio_idx;
        u8 rx_chains_static, rx_chains_dynamic;
        bool radar_enabled;

        u8 drv_priv[] __aligned(sizeof(void *));
};

/**
 * enum ieee80211_chanctx_switch_mode - channel context switch mode
 * @CHANCTX_SWMODE_REASSIGN_VIF: Both old and new contexts already
 *      exist (and will continue to exist), but the virtual interface
 *      needs to be switched from one to the other.
 * @CHANCTX_SWMODE_SWAP_CONTEXTS: The old context exists but will cease
 *      to exist with this call, the new context doesn't exist but
 *      will be active after this call, the virtual interface switches
 *      from the old to the new (note that the driver may of course
 *      implement this as an on-the-fly chandef switch of the existing
 *      hardware context, but the mac80211 pointer for the old context
 *      will cease to exist and only the new one will later be used
 *      for changes/removal.)
 */
enum ieee80211_chanctx_switch_mode {
        CHANCTX_SWMODE_REASSIGN_VIF,
        CHANCTX_SWMODE_SWAP_CONTEXTS,
};

/**
 * struct ieee80211_vif_chanctx_switch - vif chanctx switch information
 *
 * This structure is used to pass information about a vif that
 * needs to switch from one chanctx to another. The
 * &ieee80211_chanctx_switch_mode defines how the switch should be
 * done.
 *
 * @vif: the vif that should be switched from old_ctx to new_ctx
 * @link_conf: the link conf that's switching
 * @old_ctx: the old context to which the vif was assigned
 * @new_ctx: the new context to which the vif must be assigned
 */
struct ieee80211_vif_chanctx_switch {
        struct ieee80211_vif *vif;
        struct ieee80211_bss_conf *link_conf;
        struct ieee80211_chanctx_conf *old_ctx;
        struct ieee80211_chanctx_conf *new_ctx;
};
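/*
 * Editor's illustrative sketch: skeleton of a .switch_vif_chanctx op
 * handling the two switch modes defined above. The mydrv_*() helpers are
 * hypothetical; the op signature and the struct/enum come from this header.
 */
#if 0	/* example only */
static int mydrv_switch_vif_chanctx(struct ieee80211_hw *hw,
				    struct ieee80211_vif_chanctx_switch *vifs,
				    int n_vifs,
				    enum ieee80211_chanctx_switch_mode mode)
{
	int i;

	for (i = 0; i < n_vifs; i++) {
		switch (mode) {
		case CHANCTX_SWMODE_REASSIGN_VIF:
			/* both contexts keep existing; move the vif over */
			mydrv_unassign_vif(hw, vifs[i].vif, vifs[i].old_ctx);
			mydrv_assign_vif(hw, vifs[i].vif, vifs[i].new_ctx);
			break;
		case CHANCTX_SWMODE_SWAP_CONTEXTS:
			/* old ctx goes away; new ctx takes over the vif */
			mydrv_remove_ctx(hw, vifs[i].old_ctx);
			mydrv_add_ctx(hw, vifs[i].new_ctx);
			mydrv_assign_vif(hw, vifs[i].vif, vifs[i].new_ctx);
			break;
		}
	}
	return 0;
}
#endif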
/**
 * enum ieee80211_bss_change - BSS change notification flags
 *
 * These flags are used with the bss_info_changed(), link_info_changed()
 * and vif_cfg_changed() callbacks to indicate which parameter(s) changed.
 *
 * @BSS_CHANGED_ASSOC: association status changed (associated/disassociated), also implies a change in the AID.
 * @BSS_CHANGED_ERP_CTS_PROT: CTS protection changed
 * @BSS_CHANGED_ERP_PREAMBLE: preamble changed
 * @BSS_CHANGED_ERP_SLOT: slot timing changed
 * @BSS_CHANGED_HT: 802.11n parameters changed
 * @BSS_CHANGED_BASIC_RATES: Basic rateset changed
 * @BSS_CHANGED_BEACON_INT: Beacon interval changed
 * @BSS_CHANGED_BSSID: BSSID changed, for whatever reason (IBSS and managed mode)
 * @BSS_CHANGED_BEACON: Beacon data changed, retrieve new beacon (beaconing modes)
 * @BSS_CHANGED_BEACON_ENABLED: Beaconing should be enabled/disabled (beaconing modes)
 * @BSS_CHANGED_CQM: Connection quality monitor config changed
 * @BSS_CHANGED_IBSS: IBSS join status changed
 * @BSS_CHANGED_ARP_FILTER: Hardware ARP filter address list or state changed.
 * @BSS_CHANGED_QOS: QoS for this association was enabled/disabled. Note that it is only ever disabled for station mode.
 * @BSS_CHANGED_IDLE: Idle changed for this BSS/interface.
 * @BSS_CHANGED_SSID: SSID changed for this BSS (AP and IBSS mode)
 * @BSS_CHANGED_AP_PROBE_RESP: Probe Response changed for this BSS (AP mode)
 * @BSS_CHANGED_PS: PS changed for this BSS (STA mode)
 * @BSS_CHANGED_TXPOWER: TX power setting changed for this interface
 * @BSS_CHANGED_P2P_PS: P2P powersave settings (CTWindow, opportunistic PS) changed
 * @BSS_CHANGED_BEACON_INFO: Data from the AP's beacon became available: currently dtim_period only is under consideration.
 * @BSS_CHANGED_BANDWIDTH: The bandwidth used by this interface changed, note that this is only called when it changes after the channel context had been assigned.
 * @BSS_CHANGED_OCB: OCB join status changed
 * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed
 * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected keep alive) changed.
 * @BSS_CHANGED_MCAST_RATE: Multicast Rate setting changed for this interface
 * @BSS_CHANGED_FTM_RESPONDER: fine timing measurement request responder functionality changed for this BSS (AP mode).
 * @BSS_CHANGED_TWT: TWT status changed
 * @BSS_CHANGED_HE_OBSS_PD: OBSS Packet Detection status changed.
 * @BSS_CHANGED_HE_BSS_COLOR: BSS Color has changed
 * @BSS_CHANGED_FILS_DISCOVERY: FILS discovery status changed.
 * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response status changed.
 * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed.
 * @BSS_CHANGED_MLD_TTLM: negotiated TID to link mapping was changed
 * @BSS_CHANGED_TPE: transmit power envelope changed
 */
enum ieee80211_bss_change {
        BSS_CHANGED_ASSOC = 1<<0,
        BSS_CHANGED_ERP_CTS_PROT = 1<<1,
        BSS_CHANGED_ERP_PREAMBLE = 1<<2,
        BSS_CHANGED_ERP_SLOT = 1<<3,
        BSS_CHANGED_HT = 1<<4,
        BSS_CHANGED_BASIC_RATES = 1<<5,
        BSS_CHANGED_BEACON_INT = 1<<6,
        BSS_CHANGED_BSSID = 1<<7,
        BSS_CHANGED_BEACON = 1<<8,
        BSS_CHANGED_BEACON_ENABLED = 1<<9,
        BSS_CHANGED_CQM = 1<<10,
        BSS_CHANGED_IBSS = 1<<11,
        BSS_CHANGED_ARP_FILTER = 1<<12,
        BSS_CHANGED_QOS = 1<<13,
        BSS_CHANGED_IDLE = 1<<14,
        BSS_CHANGED_SSID = 1<<15,
        BSS_CHANGED_AP_PROBE_RESP = 1<<16,
        BSS_CHANGED_PS = 1<<17,
        BSS_CHANGED_TXPOWER = 1<<18,
        BSS_CHANGED_P2P_PS = 1<<19,
        BSS_CHANGED_BEACON_INFO = 1<<20,
        BSS_CHANGED_BANDWIDTH = 1<<21,
        BSS_CHANGED_OCB = 1<<22,
        BSS_CHANGED_MU_GROUPS = 1<<23,
        BSS_CHANGED_KEEP_ALIVE = 1<<24,
        BSS_CHANGED_MCAST_RATE = 1<<25,
        BSS_CHANGED_FTM_RESPONDER = 1<<26,
        BSS_CHANGED_TWT = 1<<27,
        BSS_CHANGED_HE_OBSS_PD = 1<<28,
        BSS_CHANGED_HE_BSS_COLOR = 1<<29,
        BSS_CHANGED_FILS_DISCOVERY = 1<<30,
        BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = BIT_ULL(31),
        BSS_CHANGED_MLD_VALID_LINKS = BIT_ULL(33),
        BSS_CHANGED_MLD_TTLM = BIT_ULL(34),
        BSS_CHANGED_TPE = BIT_ULL(35),

        /* when adding here, make sure to change ieee80211_reconfig */
};

/*
 * The maximum number of IPv4 addresses listed for ARP filtering. If the
 * number of addresses for an interface increases beyond this value, hardware
 * ARP filtering will be disabled.
 */
#define IEEE80211_BSS_ARP_ADDR_LIST_LEN 4

/**
 * enum ieee80211_event_type - event to be notified to the low level driver
 * @RSSI_EVENT: AP's rssi crossed a threshold set by the driver.
 * @MLME_EVENT: event related to MLME
 * @BAR_RX_EVENT: a BAR was received
 * @BA_FRAME_TIMEOUT: Frames were released from the reordering buffer because they timed out. This won't be called for each frame released, but only once each time the timeout triggers.
 */
enum ieee80211_event_type {
        RSSI_EVENT,
        MLME_EVENT,
        BAR_RX_EVENT,
        BA_FRAME_TIMEOUT,
};

/**
 * enum ieee80211_rssi_event_data - relevant when event type is %RSSI_EVENT
 * @RSSI_EVENT_HIGH: AP's rssi went above the threshold set by the driver.
 * @RSSI_EVENT_LOW: AP's rssi went below the threshold set by the driver.
 */
enum ieee80211_rssi_event_data {
        RSSI_EVENT_HIGH,
        RSSI_EVENT_LOW,
};

/**
 * struct ieee80211_rssi_event - data attached to an %RSSI_EVENT
 * @data: See &enum ieee80211_rssi_event_data
 */
struct ieee80211_rssi_event {
        enum ieee80211_rssi_event_data data;
};

/**
 * enum ieee80211_mlme_event_data - relevant when event type is %MLME_EVENT
 * @AUTH_EVENT: the MLME operation is authentication
 * @ASSOC_EVENT: the MLME operation is association
 * @DEAUTH_RX_EVENT: deauth received.
 * @DEAUTH_TX_EVENT: deauth sent.
 */
enum ieee80211_mlme_event_data {
        AUTH_EVENT,
        ASSOC_EVENT,
        DEAUTH_RX_EVENT,
        DEAUTH_TX_EVENT,
};

/**
 * enum ieee80211_mlme_event_status - relevant when event type is %MLME_EVENT
 * @MLME_SUCCESS: the MLME operation completed successfully.
 * @MLME_DENIED: the MLME operation was denied by the peer.
 * @MLME_TIMEOUT: the MLME operation timed out.
 */
enum ieee80211_mlme_event_status {
        MLME_SUCCESS,
        MLME_DENIED,
        MLME_TIMEOUT,
};

/**
 * struct ieee80211_mlme_event - data attached to an %MLME_EVENT
 * @data: See &enum ieee80211_mlme_event_data
 * @status: See &enum ieee80211_mlme_event_status
 * @reason: the reason code if applicable
 */
struct ieee80211_mlme_event {
        enum ieee80211_mlme_event_data data;
        enum ieee80211_mlme_event_status status;
        u16 reason;
};

/**
 * struct ieee80211_ba_event - data attached for BlockAck related events
 * @sta: pointer to the &ieee80211_sta to which this event relates
 * @tid: the tid
 * @ssn: the starting sequence number (for %BAR_RX_EVENT)
 */
struct ieee80211_ba_event {
        struct ieee80211_sta *sta;
        u16 tid;
        u16 ssn;
};

/**
 * struct ieee80211_event - event to be sent to the driver
 * @type: The event itself. See &enum ieee80211_event_type.
 * @u.rssi: relevant if &type is %RSSI_EVENT
 * @u.mlme: relevant if &type is %MLME_EVENT
 * @u.ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT
 * @u: union holding the fields above
 */
struct ieee80211_event {
        enum ieee80211_event_type type;
        union {
                struct ieee80211_rssi_event rssi;
                struct ieee80211_mlme_event mlme;
                struct ieee80211_ba_event ba;
        } u;
};
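/*
 * Editor's illustrative sketch: an .event_callback op dispatching on the
 * event types defined above; only the union member matching @type may be
 * read. The mydrv_*() helpers are hypothetical, and the op signature is
 * assumed to match the event_callback member of struct ieee80211_ops.
 */
#if 0	/* example only */
static void mydrv_event_callback(struct ieee80211_hw *hw,
				 struct ieee80211_vif *vif,
				 const struct ieee80211_event *event)
{
	switch (event->type) {
	case RSSI_EVENT:
		mydrv_adjust_sensitivity(hw, event->u.rssi.data);
		break;
	case MLME_EVENT:
		if (event->u.mlme.status == MLME_TIMEOUT)
			mydrv_note_mlme_timeout(hw, vif);
		break;
	case BAR_RX_EVENT:
	case BA_FRAME_TIMEOUT:
		mydrv_flush_reorder(hw, event->u.ba.sta, event->u.ba.tid);
		break;
	}
}
#endif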
/**
 * struct ieee80211_mu_group_data - STA's VHT MU-MIMO group data
 *
 * This structure describes the group id data of VHT MU-MIMO
 *
 * @membership: 64-bit array - a bit is set if station is member of the group
 * @position: 2 bits per group id indicating the position in the group
 */
struct ieee80211_mu_group_data {
        u8 membership[WLAN_MEMBERSHIP_LEN];
        u8 position[WLAN_USER_POSITION_LEN];
};

/**
 * struct ieee80211_ftm_responder_params - FTM responder parameters
 *
 * @lci: LCI subelement content
 * @civicloc: CIVIC location subelement content
 * @lci_len: LCI data length
 * @civicloc_len: Civic data length
 */
struct ieee80211_ftm_responder_params {
        const u8 *lci;
        const u8 *civicloc;
        size_t lci_len;
        size_t civicloc_len;
};

/**
 * struct ieee80211_fils_discovery - FILS discovery parameters from
 * IEEE Std 802.11ai-2016, Annex C.3 MIB detail.
 *
 * @min_interval: Minimum packet interval in TUs (0 - 10000)
 * @max_interval: Maximum packet interval in TUs (0 - 10000)
 */
struct ieee80211_fils_discovery {
        u32 min_interval;
        u32 max_interval;
};

#define IEEE80211_TPE_EIRP_ENTRIES_320MHZ 5
struct ieee80211_parsed_tpe_eirp {
        bool valid;
        s8 power[IEEE80211_TPE_EIRP_ENTRIES_320MHZ];
        u8 count;
};

#define IEEE80211_TPE_PSD_ENTRIES_320MHZ 16
struct ieee80211_parsed_tpe_psd {
        bool valid;
        s8 power[IEEE80211_TPE_PSD_ENTRIES_320MHZ];
        u8 count, n;
};

/**
 * struct ieee80211_parsed_tpe - parsed transmit power envelope information
 * @max_local: maximum local EIRP, one value for 20, 40, 80, 160, 320 MHz each (indexed by TX power category)
 * @max_reg_client: maximum regulatory client EIRP, one value for 20, 40, 80, 160, 320 MHz each (indexed by TX power category)
 * @psd_local: maximum local power spectral density, one value for each 20 MHz subchannel per bss_conf's chanreq.oper (indexed by TX power category)
 * @psd_reg_client: maximum regulatory power spectral density, one value for each 20 MHz subchannel per bss_conf's chanreq.oper (indexed by TX power category)
 */
struct ieee80211_parsed_tpe {
        struct ieee80211_parsed_tpe_eirp max_local[2], max_reg_client[2];
        struct ieee80211_parsed_tpe_psd psd_local[2], psd_reg_client[2];
};

/**
 * struct ieee80211_bss_conf - holds the BSS's changing parameters
 *
 * This structure keeps information about a BSS (and an association
 * to that BSS) that can change during the lifetime of the BSS.
 *
 * @vif: reference to owning VIF
 * @bss: the cfg80211 bss descriptor. Valid only for a station, and only when associated. Note: This contains information which is not necessarily authenticated. For example, information coming from probe responses.
 * @addr: (link) address used locally
 * @link_id: link ID, or 0 for non-MLO
 * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE
 * @uora_exists: is the UORA element advertised by AP
 * @uora_ocw_range: UORA element's OCW Range field
 * @frame_time_rts_th: HE duration RTS threshold, in units of 32us
 * @he_support: does this BSS support HE
 * @twt_requester: does this BSS support TWT requester (relevant for managed mode only, set if the AP advertises TWT responder role)
 * @twt_responder: does this BSS support TWT responder (relevant for managed mode only, set if the AP advertises TWT responder role)
 * @twt_protected: does this BSS support protected TWT frames
 * @twt_broadcast: does this BSS support broadcast TWT
 * @use_cts_prot: use CTS protection
 * @use_short_preamble: use 802.11b short preamble
 * @use_short_slot: use short slot time (only relevant for ERP)
 * @dtim_period: num of beacons before the next DTIM, for beaconing, valid in station mode only after the driver was notified with the %BSS_CHANGED_BEACON_INFO flag, and will be non-zero then.
 * @sync_tsf: last beacon's/probe response's TSF timestamp (could be old as it may have been received during scanning long ago). If the HW flag %IEEE80211_HW_TIMING_BEACON_ONLY is set, then this can only come from a beacon, but might not become valid until after association when a beacon is received (which is notified with the %BSS_CHANGED_BEACON_INFO flag). See also sync_dtim_count important notice.
 * @sync_device_ts: the device timestamp corresponding to the sync_tsf, the driver/device can use this to calculate synchronisation (see @sync_tsf). See also sync_dtim_count important notice.
 * @sync_dtim_count: Only valid when %IEEE80211_HW_TIMING_BEACON_ONLY is requested, see @sync_tsf/@sync_device_ts.
 *      IMPORTANT: These three sync_* parameters may be out of sync by the time the driver uses them; the synchronized view is currently guaranteed only in certain callbacks.
 *      Note also that this is not used with MLD associations, mac80211 doesn't know how to track beacons for all of the links for this.
 * @beacon_int: beacon interval
 * @assoc_capability: capabilities taken from assoc resp
 * @basic_rates: bitmap of basic rates, each bit stands for an index into the rate table configured by the driver in the current band.
 * @beacon_rate: associated AP's beacon TX rate
 * @mcast_rate: per-band multicast rate index + 1 (0: disabled)
 * @bssid: The BSSID for this BSS
 * @enable_beacon: whether beaconing should be enabled or not
 * @chanreq: Channel request for this BSS -- the hardware might be configured a higher bandwidth than this BSS uses, for example.
 * @mu_group: VHT MU-MIMO group membership data
 * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. This field is only valid when the channel is a wide HT/VHT channel. Note that with TDLS this can be the case (channel is HT, protection must be used from this field) even when the BSS association isn't using HT.
 * @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value implies disabled. As with the cfg80211 callback, a change here should cause an event to be sent indicating where the current value is in relation to the newly configured threshold.
 * @cqm_rssi_low: Connection quality monitor RSSI lower threshold, a zero value implies disabled. This is an alternative mechanism to the single threshold event and can't be enabled simultaneously with it.
 * @cqm_rssi_high: Connection quality monitor RSSI upper threshold.
 * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis
 * @qos: This is a QoS-enabled BSS.
 * @hidden_ssid: The SSID of the current vif is hidden. Only valid in AP-mode.
 * @txpower: TX power in dBm. INT_MIN means not configured.
 * @txpower_type: TX power adjustment used to control per packet Transmit Power Control (TPC) in the lower driver for the current vif. In particular TPC is enabled if the value passed in %txpower_type is NL80211_TX_POWER_LIMITED (allow using less than specified from userspace), whereas TPC is disabled if %txpower_type is set to NL80211_TX_POWER_FIXED (use value configured from userspace)
 * @p2p_noa_attr: P2P NoA attribute for P2P powersave
 * @allow_p2p_go_ps: indication for AP or P2P GO interface, whether it's allowed to use the P2P PS mechanism or not. AP/P2P GO is not allowed to use P2P PS if it has associated clients without P2P PS support.
 * @max_idle_period: the time period during which the station can refrain from transmitting frames to its associated AP without being disassociated. In units of 1000 TUs. Zero value indicates that the AP did not include a (valid) BSS Max Idle Period Element.
 * @protected_keep_alive: if set, indicates that the station should send an RSN protected frame to the AP to reset the idle timer at the AP for the station.
 * @ftm_responder: whether to enable or disable fine timing measurement FTM responder functionality.
 * @ftmr_params: configurable lci/civic parameter when enabling FTM responder.
 * @nontransmitted: this BSS is a nontransmitted BSS profile
 * @transmitter_bssid: the address of transmitter AP
 * @bssid_index: index inside the multiple BSSID set
 * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set
 * @ema_ap: AP supports enhancements of discovery and advertisement of nontransmitted BSSIDs
 * @profile_periodicity: the least number of beacon frames that need to be received in order to discover all the nontransmitted BSSIDs in the set.
 * @he_oper: HE operation information of the BSS (AP/Mesh) or of the AP we are connected to (STA)
 * @he_obss_pd: OBSS Packet Detection parameters.
 * @he_bss_color: BSS coloring settings, if BSS supports HE
 * @fils_discovery: FILS discovery configuration
 * @unsol_bcast_probe_resp_interval: Unsolicited broadcast probe response interval.
 * @beacon_tx_rate: The configured beacon transmit rate that needs to be passed to the driver when rate control is offloaded to firmware.
 * @power_type: power type of BSS for 6 GHz
 * @tpe: transmit power envelope information
 * @pwr_reduction: power constraint of BSS.
 * @eht_support: does this BSS support EHT
 * @csa_active: marks whether a channel switch is going on.
 * @mu_mimo_owner: indicates interface owns MU-MIMO capability
 * @chanctx_conf: The channel context this interface is assigned to, or %NULL when it is not assigned. This pointer is RCU-protected due to the TX path needing to access it; even though the netdev carrier will always be off when it is %NULL there can still be races and packets could be processed after it switches back to %NULL.
 * @color_change_active: marks whether a color change is ongoing.
 * @color_change_color: the bss color that will be used after the change.
 * @ht_ldpc: in AP mode, indicates interface has HT LDPC capability.
 * @vht_ldpc: in AP mode, indicates interface has VHT LDPC capability.
 * @he_ldpc: in AP mode, indicates interface has HE LDPC capability.
 * @vht_su_beamformer: in AP mode, does this BSS support operation as a VHT SU beamformer
 * @vht_su_beamformee: in AP mode, does this BSS support operation as a VHT SU beamformee
 * @vht_mu_beamformer: in AP mode, does this BSS support operation as a VHT MU beamformer
 * @vht_mu_beamformee: in AP mode, does this BSS support operation as a VHT MU beamformee
 * @he_su_beamformer: in AP-mode, does this BSS support operation as an HE SU beamformer
 * @he_su_beamformee: in AP-mode, does this BSS support operation as an HE SU beamformee
 * @he_mu_beamformer: in AP-mode, does this BSS support operation as an HE MU beamformer
 * @he_full_ul_mumimo: does this BSS support the reception (AP) or transmission (non-AP STA) of an HE TB PPDU on an RU that spans the entire PPDU bandwidth
 * @eht_su_beamformer: in AP-mode, does this BSS enable operation as an EHT SU beamformer
 * @eht_su_beamformee: in AP-mode, does this BSS enable operation as an EHT SU beamformee
 * @eht_mu_beamformer: in AP-mode, does this BSS enable operation as an EHT MU beamformer
 * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the reception of an EHT TB PPDU on an RU that spans the entire PPDU bandwidth
 * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This information is the latest known value. It can come from this link's beacon or from a beacon sent by another link.
 * @bss_param_ch_cnt_link_id: in BSS-mode, the link_id to which the beacon that updated &bss_param_ch_cnt belongs. E.g. if link 1 doesn't hear its beacons, and link 2 sent a beacon with an RNR element that updated link 1's BSS params change count, then link 1's bss_param_ch_cnt_link_id will be 2. That means that link 1 knows that link 2 was the link that updated its bss_param_ch_cnt value. In case link 1 hears its beacon again, bss_param_ch_cnt_link_id will be updated to 1, even if bss_param_ch_cnt didn't change. This allows the link to know that it heard the latest value from its own beacon (as opposed to hearing its value from another link's beacon).
*/ struct ieee80211_bss_conf { struct ieee80211_vif *vif; struct cfg80211_bss *bss; const u8 *bssid; unsigned int link_id; u8 addr[ETH_ALEN] __aligned(2); u8 htc_trig_based_pkt_ext; bool uora_exists; u8 uora_ocw_range; u16 frame_time_rts_th; bool he_support; bool twt_requester; bool twt_responder; bool twt_protected; bool twt_broadcast; /* erp related data */ bool use_cts_prot; bool use_short_preamble; bool use_short_slot; bool enable_beacon; u8 dtim_period; u16 beacon_int; u16 assoc_capability; u64 sync_tsf; u32 sync_device_ts; u8 sync_dtim_count; u32 basic_rates; struct ieee80211_rate *beacon_rate; int mcast_rate[NUM_NL80211_BANDS]; u16 ht_operation_mode; s32 cqm_rssi_thold; u32 cqm_rssi_hyst; s32 cqm_rssi_low; s32 cqm_rssi_high; struct ieee80211_chan_req chanreq; struct ieee80211_mu_group_data mu_group; bool qos; bool hidden_ssid; int txpower; enum nl80211_tx_power_setting txpower_type; struct ieee80211_p2p_noa_attr p2p_noa_attr; bool allow_p2p_go_ps; u16 max_idle_period; bool protected_keep_alive; bool ftm_responder; struct ieee80211_ftm_responder_params *ftmr_params; /* Multiple BSSID data */ bool nontransmitted; u8 transmitter_bssid[ETH_ALEN]; u8 bssid_index; u8 bssid_indicator; bool ema_ap; u8 profile_periodicity; struct { u32 params; u16 nss_set; } he_oper; struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; struct ieee80211_fils_discovery fils_discovery; u32 unsol_bcast_probe_resp_interval; struct cfg80211_bitrate_mask beacon_tx_rate; enum ieee80211_ap_reg_power power_type; struct ieee80211_parsed_tpe tpe; u8 pwr_reduction; bool eht_support; bool csa_active; bool mu_mimo_owner; struct ieee80211_chanctx_conf __rcu *chanctx_conf; bool color_change_active; u8 color_change_color; bool ht_ldpc; bool vht_ldpc; bool he_ldpc; bool vht_su_beamformer; bool vht_su_beamformee; bool vht_mu_beamformer; bool vht_mu_beamformee; bool he_su_beamformer; bool he_su_beamformee; bool he_mu_beamformer; bool he_full_ul_mumimo; bool eht_su_beamformer; bool eht_su_beamformee; bool eht_mu_beamformer; bool eht_80mhz_full_bw_ul_mumimo; u8 bss_param_ch_cnt; u8 bss_param_ch_cnt_link_id; }; /** * enum mac80211_tx_info_flags - flags to describe transmission information/status * * These flags are used with the @flags member of &ieee80211_tx_info. * * @IEEE80211_TX_CTL_REQ_TX_STATUS: require TX status callback for this frame. * @IEEE80211_TX_CTL_ASSIGN_SEQ: The driver has to assign a sequence * number to this frame, taking care of not overwriting the fragment * number and increasing the sequence number only when the * IEEE80211_TX_CTL_FIRST_FRAGMENT flag is set. mac80211 will properly * assign sequence numbers to QoS-data frames but cannot do so correctly * for non-QoS-data and management frames because beacons need them from * that counter as well and mac80211 cannot guarantee proper sequencing. * If this flag is set, the driver should instruct the hardware to * assign a sequence number to the frame or assign one itself. Cf. IEEE * 802.11-2007 7.1.3.4.1 paragraph 3. This flag will always be set for * beacons and always be clear for frames without a sequence number field. 
* @IEEE80211_TX_CTL_NO_ACK: tell the low-level code not to wait for an ACK * @IEEE80211_TX_CTL_CLEAR_PS_FILT: clear powersave filter for destination * station * @IEEE80211_TX_CTL_FIRST_FRAGMENT: this is the first fragment of the frame * @IEEE80211_TX_CTL_SEND_AFTER_DTIM: send this frame after the DTIM beacon * @IEEE80211_TX_CTL_AMPDU: this frame should be sent as part of an A-MPDU * @IEEE80211_TX_CTL_INJECTED: Frame was injected, internal to mac80211. * @IEEE80211_TX_STAT_TX_FILTERED: The frame was not transmitted * because the destination STA was in powersave mode. Note that to * avoid race conditions, the filter must be set by the hardware or * firmware upon receiving a frame that indicates that the station * went to sleep (must be done on device to filter frames already on * the queue) and may only be unset after mac80211 gives the OK for * that by setting the IEEE80211_TX_CTL_CLEAR_PS_FILT (see above), * since only then is it guaranteed that no more frames are in the * hardware queue. * @IEEE80211_TX_STAT_ACK: Frame was acknowledged * @IEEE80211_TX_STAT_AMPDU: The frame was aggregated, so status * is for the whole aggregation. * @IEEE80211_TX_STAT_AMPDU_NO_BACK: no block ack was returned, * so consider using a block ack request (BAR). * @IEEE80211_TX_CTL_RATE_CTRL_PROBE: internal to mac80211, can be * set by rate control algorithms to indicate a probe rate, will * be cleared for fragmented frames (except on the last fragment) * @IEEE80211_TX_INTFL_OFFCHAN_TX_OK: Internal to mac80211. Used to indicate * that a frame can be transmitted while the queues are stopped for * off-channel operation. * @IEEE80211_TX_CTL_HW_80211_ENCAP: This frame uses hardware encapsulation * (header conversion) * @IEEE80211_TX_INTFL_RETRIED: completely internal to mac80211, * used to indicate that a frame was already retried due to PS * @IEEE80211_TX_INTFL_DONT_ENCRYPT: completely internal to mac80211, * used to indicate the frame should not be encrypted * @IEEE80211_TX_CTL_NO_PS_BUFFER: This frame is a response to a poll * frame (PS-Poll or uAPSD) or a non-bufferable MMPDU and must * be sent although the station is in powersave mode. * @IEEE80211_TX_CTL_MORE_FRAMES: More frames will be passed to the * transmit function after the current frame, this can be used * by drivers to kick the DMA queue only if unset or when the * queue gets full. * @IEEE80211_TX_INTFL_RETRANSMISSION: This frame is being retransmitted * after TX status because the destination was asleep, it must not * be modified again (no seqno assignment, crypto, etc.) * @IEEE80211_TX_INTFL_MLME_CONN_TX: This frame was transmitted by the MLME * code for connection establishment, this indicates that its status * should kick the MLME state machine. * @IEEE80211_TX_INTFL_NL80211_FRAME_TX: Frame was requested through nl80211 * MLME command (internal to mac80211 to figure out whether to send TX * status to user space) * @IEEE80211_TX_CTL_LDPC: tells the driver to use LDPC for this frame * @IEEE80211_TX_CTL_STBC: Enables Space-Time Block Coding (STBC) for this * frame and selects the maximum number of streams that it can use. * @IEEE80211_TX_CTL_TX_OFFCHAN: Marks this packet to be transmitted on * the off-channel channel when a remain-on-channel offload is done * in hardware -- normal packets still flow and are expected to be * handled properly by the device. * @IEEE80211_TX_INTFL_TKIP_MIC_FAILURE: Marks this packet to be used for TKIP * testing. It will be sent out with an incorrect Michael MIC key to allow * TKIP countermeasures to be tested.
* @IEEE80211_TX_CTL_NO_CCK_RATE: This frame will be sent at a non-CCK rate. * This flag is mainly used for management frames, especially P2P * frames that must not be sent at a CCK rate in the 2 GHz band. * @IEEE80211_TX_STATUS_EOSP: This packet marks the end of service period, * when its status is reported the service period ends. For frames in * an SP that mac80211 transmits, it is already set; for driver frames * the driver may set this flag. It is also used to do the same for * PS-Poll responses. * @IEEE80211_TX_CTL_USE_MINRATE: This frame will be sent at the lowest rate. * This flag is used to send a nullfunc frame at the minimum rate when * the nullfunc is used for connection monitoring purposes. * @IEEE80211_TX_CTL_DONTFRAG: Don't fragment this packet even if it * would be fragmented by size (this is optional, only used for * monitor injection). * @IEEE80211_TX_STAT_NOACK_TRANSMITTED: A frame that was marked with * IEEE80211_TX_CTL_NO_ACK has been successfully transmitted without * any errors (such as issues specific to the driver/HW). * This flag must not be set for frames that don't request no-ack * behaviour with IEEE80211_TX_CTL_NO_ACK. * * Note: If you have to add new flags to the enumeration, then don't * forget to update %IEEE80211_TX_TEMPORARY_FLAGS when necessary. */ enum mac80211_tx_info_flags { IEEE80211_TX_CTL_REQ_TX_STATUS = BIT(0), IEEE80211_TX_CTL_ASSIGN_SEQ = BIT(1), IEEE80211_TX_CTL_NO_ACK = BIT(2), IEEE80211_TX_CTL_CLEAR_PS_FILT = BIT(3), IEEE80211_TX_CTL_FIRST_FRAGMENT = BIT(4), IEEE80211_TX_CTL_SEND_AFTER_DTIM = BIT(5), IEEE80211_TX_CTL_AMPDU = BIT(6), IEEE80211_TX_CTL_INJECTED = BIT(7), IEEE80211_TX_STAT_TX_FILTERED = BIT(8), IEEE80211_TX_STAT_ACK = BIT(9), IEEE80211_TX_STAT_AMPDU = BIT(10), IEEE80211_TX_STAT_AMPDU_NO_BACK = BIT(11), IEEE80211_TX_CTL_RATE_CTRL_PROBE = BIT(12), IEEE80211_TX_INTFL_OFFCHAN_TX_OK = BIT(13), IEEE80211_TX_CTL_HW_80211_ENCAP = BIT(14), IEEE80211_TX_INTFL_RETRIED = BIT(15), IEEE80211_TX_INTFL_DONT_ENCRYPT = BIT(16), IEEE80211_TX_CTL_NO_PS_BUFFER = BIT(17), IEEE80211_TX_CTL_MORE_FRAMES = BIT(18), IEEE80211_TX_INTFL_RETRANSMISSION = BIT(19), IEEE80211_TX_INTFL_MLME_CONN_TX = BIT(20), IEEE80211_TX_INTFL_NL80211_FRAME_TX = BIT(21), IEEE80211_TX_CTL_LDPC = BIT(22), IEEE80211_TX_CTL_STBC = BIT(23) | BIT(24), IEEE80211_TX_CTL_TX_OFFCHAN = BIT(25), IEEE80211_TX_INTFL_TKIP_MIC_FAILURE = BIT(26), IEEE80211_TX_CTL_NO_CCK_RATE = BIT(27), IEEE80211_TX_STATUS_EOSP = BIT(28), IEEE80211_TX_CTL_USE_MINRATE = BIT(29), IEEE80211_TX_CTL_DONTFRAG = BIT(30), IEEE80211_TX_STAT_NOACK_TRANSMITTED = BIT(31), }; #define IEEE80211_TX_CTL_STBC_SHIFT 23 #define IEEE80211_TX_RC_S1G_MCS IEEE80211_TX_RC_VHT_MCS /** * enum mac80211_tx_control_flags - flags to describe transmit control * * @IEEE80211_TX_CTRL_PORT_CTRL_PROTO: this frame is a port control * protocol frame (e.g. EAP) * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll * frame (PS-Poll or uAPSD). * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup * @IEEE80211_TX_INTCFL_NEED_TXPROCESSING: completely internal to mac80211, * used to indicate that a pending frame requires TX processing before * it can be sent out. * @IEEE80211_TX_CTRL_NO_SEQNO: Do not overwrite the sequence number that * has already been assigned to this frame.
* @IEEE80211_TX_CTRL_DONT_REORDER: This frame should not be reordered * relative to other frames that have this flag set, independent * of their QoS TID or other priority field values. * @IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX: first MLO TX, used mostly internally * for sequence number assignment * @IEEE80211_TX_CTRL_DONT_USE_RATE_MASK: Don't apply the configured rate mask * to this frame; used for frames transmitted due to scanning or * off-channel TX rather than normal operation on the interface. * @IEEE80211_TX_CTRL_MLO_LINK: If not @IEEE80211_LINK_UNSPECIFIED, this * frame should be transmitted on the specific link. This really is * only relevant for frames that do not have data present, and is * also not used for 802.3 format frames. Note that even if the frame * is on a specific link, address translation might still apply if * it's intended for an MLD. * * These flags are used in tx_info->control.flags. */ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_PORT_CTRL_PROTO = BIT(0), IEEE80211_TX_CTRL_PS_RESPONSE = BIT(1), IEEE80211_TX_CTRL_RATE_INJECT = BIT(2), IEEE80211_TX_CTRL_AMSDU = BIT(3), IEEE80211_TX_CTRL_FAST_XMIT = BIT(4), IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP = BIT(5), IEEE80211_TX_INTCFL_NEED_TXPROCESSING = BIT(6), IEEE80211_TX_CTRL_NO_SEQNO = BIT(7), IEEE80211_TX_CTRL_DONT_REORDER = BIT(8), IEEE80211_TX_CTRL_MCAST_MLO_FIRST_TX = BIT(9), IEEE80211_TX_CTRL_DONT_USE_RATE_MASK = BIT(10), IEEE80211_TX_CTRL_MLO_LINK = 0xf0000000, }; #define IEEE80211_LINK_UNSPECIFIED 0xf #define IEEE80211_TX_CTRL_MLO_LINK_UNSPEC \ u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, \ IEEE80211_TX_CTRL_MLO_LINK) /** * enum mac80211_tx_status_flags - flags to describe transmit status * * @IEEE80211_TX_STATUS_ACK_SIGNAL_VALID: ACK signal is valid * * These flags are used in tx_info->status.flags. */ enum mac80211_tx_status_flags { IEEE80211_TX_STATUS_ACK_SIGNAL_VALID = BIT(0), }; /* * This definition is used as a mask to clear all temporary flags, which are * set by the tx handlers for each transmission attempt by the mac80211 stack. */ #define IEEE80211_TX_TEMPORARY_FLAGS (IEEE80211_TX_CTL_NO_ACK | \ IEEE80211_TX_CTL_CLEAR_PS_FILT | IEEE80211_TX_CTL_FIRST_FRAGMENT | \ IEEE80211_TX_CTL_SEND_AFTER_DTIM | IEEE80211_TX_CTL_AMPDU | \ IEEE80211_TX_STAT_TX_FILTERED | IEEE80211_TX_STAT_ACK | \ IEEE80211_TX_STAT_AMPDU | IEEE80211_TX_STAT_AMPDU_NO_BACK | \ IEEE80211_TX_CTL_RATE_CTRL_PROBE | IEEE80211_TX_CTL_NO_PS_BUFFER | \ IEEE80211_TX_CTL_MORE_FRAMES | IEEE80211_TX_CTL_LDPC | \ IEEE80211_TX_CTL_STBC | IEEE80211_TX_STATUS_EOSP) /** * enum mac80211_rate_control_flags - per-rate flags set by the * Rate Control algorithm. * * These flags are set by the Rate control algorithm for each rate during tx, * in the @flags member of struct ieee80211_tx_rate. * * @IEEE80211_TX_RC_USE_RTS_CTS: Use RTS/CTS exchange for this rate. * @IEEE80211_TX_RC_USE_CTS_PROTECT: CTS-to-self protection is required. * This is set if the current BSS requires ERP protection. * @IEEE80211_TX_RC_USE_SHORT_PREAMBLE: Use short preamble. * @IEEE80211_TX_RC_MCS: HT rate. * @IEEE80211_TX_RC_VHT_MCS: VHT MCS rate, in this case the idx field is split * into the higher 4 bits (NSS) and the lower 4 bits (MCS number) * @IEEE80211_TX_RC_GREEN_FIELD: Indicates whether this rate should be used in * Greenfield mode. * @IEEE80211_TX_RC_40_MHZ_WIDTH: Indicates if the Channel Width should be 40 MHz.
* @IEEE80211_TX_RC_80_MHZ_WIDTH: Indicates 80 MHz transmission * @IEEE80211_TX_RC_160_MHZ_WIDTH: Indicates 160 MHz transmission * (80+80 isn't supported yet) * @IEEE80211_TX_RC_DUP_DATA: The frame should be transmitted on both of the * adjacent 20 MHz channels, if the current channel type is * NL80211_CHAN_HT40MINUS or NL80211_CHAN_HT40PLUS. * @IEEE80211_TX_RC_SHORT_GI: Short Guard interval should be used for this rate. */ enum mac80211_rate_control_flags { IEEE80211_TX_RC_USE_RTS_CTS = BIT(0), IEEE80211_TX_RC_USE_CTS_PROTECT = BIT(1), IEEE80211_TX_RC_USE_SHORT_PREAMBLE = BIT(2), /* rate index is an HT/VHT MCS instead of an index */ IEEE80211_TX_RC_MCS = BIT(3), IEEE80211_TX_RC_GREEN_FIELD = BIT(4), IEEE80211_TX_RC_40_MHZ_WIDTH = BIT(5), IEEE80211_TX_RC_DUP_DATA = BIT(6), IEEE80211_TX_RC_SHORT_GI = BIT(7), IEEE80211_TX_RC_VHT_MCS = BIT(8), IEEE80211_TX_RC_80_MHZ_WIDTH = BIT(9), IEEE80211_TX_RC_160_MHZ_WIDTH = BIT(10), }; /* there are 40 bytes if you don't need the rateset to be kept */ #define IEEE80211_TX_INFO_DRIVER_DATA_SIZE 40 /* if you do need the rateset, then you have less space */ #define IEEE80211_TX_INFO_RATE_DRIVER_DATA_SIZE 24 /* maximum number of rate stages */ #define IEEE80211_TX_MAX_RATES 4 /* maximum number of rate table entries */ #define IEEE80211_TX_RATE_TABLE_SIZE 4 /** * struct ieee80211_tx_rate - rate selection/status * * @idx: rate index to attempt to send with * @flags: rate control flags (&enum mac80211_rate_control_flags) * @count: number of tries in this rate before going to the next rate * * A value of -1 for @idx indicates an invalid rate and, if used * in an array of retry rates, that no more rates should be tried. * * When used for transmit status reporting, the driver should * always report the rate along with the flags it used. * * &struct ieee80211_tx_info contains an array of these structs * in the control information, and it will be filled by the rate * control algorithm according to what should be sent. For example, * if this array contains, in the format { <idx>, <count> } the * information:: * * { 3, 2 }, { 2, 2 }, { 1, 4 }, { -1, 0 }, { -1, 0 } * * then this means that the frame should be transmitted * up to twice at rate 3, up to twice at rate 2, and up to four * times at rate 1 if it doesn't get acknowledged. Say it gets * acknowledged by the peer after the fifth attempt, the status * information should then contain:: * * { 3, 2 }, { 2, 2 }, { 1, 1 }, { -1, 0 } ... * * since it was transmitted twice at rate 3, twice at rate 2 * and once at rate 1 after which we received an acknowledgement. 
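 *
 * For VHT rates the @idx field packs both NSS and MCS; rather than
 * open-coding that layout, drivers should use the helpers declared
 * below (a minimal sketch)::
 *
 *	struct ieee80211_tx_rate rate = {};
 *
 *	rate.flags = IEEE80211_TX_RC_VHT_MCS;
 *	ieee80211_rate_set_vht(&rate, 7, 2);
 *
 * after which ieee80211_rate_get_vht_mcs() returns 7 and
 * ieee80211_rate_get_vht_nss() returns 2.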
*/ struct ieee80211_tx_rate { s8 idx; u16 count:5, flags:11; } __packed; #define IEEE80211_MAX_TX_RETRY 31 static inline bool ieee80211_rate_valid(struct ieee80211_tx_rate *rate) { return rate->idx >= 0 && rate->count > 0; } static inline void ieee80211_rate_set_vht(struct ieee80211_tx_rate *rate, u8 mcs, u8 nss) { WARN_ON(mcs & ~0xF); WARN_ON((nss - 1) & ~0x7); rate->idx = ((nss - 1) << 4) | mcs; } static inline u8 ieee80211_rate_get_vht_mcs(const struct ieee80211_tx_rate *rate) { return rate->idx & 0xF; } static inline u8 ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) { return (rate->idx >> 4) + 1; } /** * struct ieee80211_tx_info - skb transmit information * * This structure is placed in skb->cb for three uses: * (1) mac80211 TX control - mac80211 tells the driver what to do * (2) driver internal use (if applicable) * (3) TX status information - driver tells mac80211 what happened * * @flags: transmit info flags, defined above * @band: the band to transmit on (use e.g. for checking for races), * not valid if the interface is an MLD since we won't know which * link the frame will be transmitted on * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC * @status_data: internal data for TX status handling, assigned privately, * see also &enum ieee80211_status_data for the internal documentation * @status_data_idr: indicates the status data is an IDR-allocated ID for an ACK frame * @tx_time_est: TX time estimate in units of 4us, used internally * @control: union part for control data * @control.rates: TX rates array to try * @control.rts_cts_rate_idx: rate for RTS or CTS * @control.use_rts: use RTS * @control.use_cts_prot: use CTS protection * @control.short_preamble: use short preamble (CCK only) * @control.skip_table: skip externally configured rate table * @control.jiffies: timestamp for expiry on powersave clients * @control.vif: virtual interface (may be NULL) * @control.hw_key: key to encrypt with (may be NULL) * @control.flags: control flags, see &enum mac80211_tx_control_flags * @control.enqueue_time: enqueue time (for iTXQs) * @driver_rates: alias to @control.rates to reserve space * @pad: padding * @rate_driver_data: driver use area if driver needs @control.rates * @status: union part for status data * @status.rates: attempted rates * @status.ack_signal: ACK signal * @status.ampdu_ack_len: AMPDU ack length * @status.ampdu_len: AMPDU length * @status.antenna: (legacy, kept only for iwlegacy) * @status.tx_time: airtime consumed for transmission; note this is only * used for WMM AC, not for airtime fairness * @status.flags: status flags, see &enum mac80211_tx_status_flags * @status.status_driver_data: driver use area * @ack: union part for pure ACK data * @ack.cookie: cookie for the ACK * @driver_data: array of driver_data pointers */ struct ieee80211_tx_info { /* common information */ u32 flags; u32 band:3, status_data_idr:1, status_data:13, hw_queue:4, tx_time_est:10; /* 1 free bit */ union { struct { union { /* rate control */ struct { struct ieee80211_tx_rate rates[ IEEE80211_TX_MAX_RATES]; s8 rts_cts_rate_idx; u8 use_rts:1; u8 use_cts_prot:1; u8 short_preamble:1; u8 skip_table:1; /* for injection only (bitmap) */ u8 antennas:2; /* 14 bits free */ }; /* only needed before rate control */ unsigned long jiffies; }; /* NB: vif can be NULL for injected frames */ struct ieee80211_vif *vif; struct ieee80211_key_conf *hw_key; u32 flags; codel_time_t enqueue_time; } control; struct { u64 cookie; } ack; struct { struct ieee80211_tx_rate
rates[IEEE80211_TX_MAX_RATES]; s32 ack_signal; u8 ampdu_ack_len; u8 ampdu_len; u8 antenna; u8 pad; u16 tx_time; u8 flags; u8 pad2; void *status_driver_data[16 / sizeof(void *)]; } status; struct { struct ieee80211_tx_rate driver_rates[ IEEE80211_TX_MAX_RATES]; u8 pad[4]; void *rate_driver_data[ IEEE80211_TX_INFO_RATE_DRIVER_DATA_SIZE / sizeof(void *)]; }; void *driver_data[ IEEE80211_TX_INFO_DRIVER_DATA_SIZE / sizeof(void *)]; }; }; static inline u16 ieee80211_info_set_tx_time_est(struct ieee80211_tx_info *info, u16 tx_time_est) { /* We only have 10 bits in tx_time_est, so store airtime * in increments of 4us and clamp the maximum to 2**12-1 */ info->tx_time_est = min_t(u16, tx_time_est, 4095) >> 2; return info->tx_time_est << 2; } static inline u16 ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info) { return info->tx_time_est << 2; } /** * struct ieee80211_rate_status - MRR stage for status path * * This struct is used in struct ieee80211_tx_status to provide drivers a * dynamic way to report about used rates and power levels per packet. * * @rate_idx: The rate that was actually used. * @try_count: How often the rate was tried. * @tx_power_idx: An index into the ieee80211_hw->tx_power_levels list of the * corresponding wifi hardware. The index shall point to the power level * that was used when sending the packet. */ struct ieee80211_rate_status { struct rate_info rate_idx; u8 try_count; u8 tx_power_idx; }; /** * struct ieee80211_tx_status - extended tx status info for rate control * * @sta: Station that the packet was transmitted for * @info: Basic tx status information * @skb: Packet skb (can be NULL if not provided by the driver) * @rates: MRR stages that were used when sending the packet * @n_rates: Number of MRR stages (count of instances for @rates) * @free_list: list where processed skbs are stored to be freed by the driver * @ack_hwtstamp: Hardware timestamp of the received ack in nanoseconds * Only needed for Timing measurement and Fine timing measurement action * frames. Only reported by devices that have timestamping enabled. */ struct ieee80211_tx_status { struct ieee80211_sta *sta; struct ieee80211_tx_info *info; struct sk_buff *skb; struct ieee80211_rate_status *rates; ktime_t ack_hwtstamp; u8 n_rates; struct list_head *free_list; }; /** * struct ieee80211_scan_ies - descriptors for different blocks of IEs * * This structure is used to point to different blocks of IEs in HW scan * and scheduled scan. These blocks contain the IEs passed by userspace * and the ones generated by mac80211. * * @ies: pointers to band-specific IEs. * @len: lengths of band-specific IEs. * @common_ies: IEs for all bands (especially vendor specific ones) * @common_ie_len: length of the common_ies */ struct ieee80211_scan_ies { const u8 *ies[NUM_NL80211_BANDS]; size_t len[NUM_NL80211_BANDS]; const u8 *common_ies; size_t common_ie_len; }; static inline struct ieee80211_tx_info *IEEE80211_SKB_CB(struct sk_buff *skb) { return (struct ieee80211_tx_info *)skb->cb; } static inline struct ieee80211_rx_status *IEEE80211_SKB_RXCB(struct sk_buff *skb) { return (struct ieee80211_rx_status *)skb->cb; } /** * ieee80211_tx_info_clear_status - clear TX status * * @info: The &struct ieee80211_tx_info to be cleared. * * When the driver passes an skb back to mac80211, it must report * a number of things in TX status. This function clears everything * in the TX status but the rate control information (it does clear * the count since you need to fill that in anyway).
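 *
 * A driver's TX completion path might use it roughly like this (a
 * hedged sketch; the "acked" flag and the surrounding bookkeeping
 * are driver-specific)::
 *
 *	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 *
 *	ieee80211_tx_info_clear_status(info);
 *	info->status.rates[0].count = 1;
 *	if (acked)
 *		info->flags |= IEEE80211_TX_STAT_ACK;
 *
 * before handing the skb back through one of the ieee80211_tx_status
 * family of functions.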
* * NOTE: While the rates array is kept intact, this will wipe all of the * driver_data fields in info, so it's up to the driver to restore * any fields it needs after calling this helper. */ static inline void ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) { int i; BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) != offsetof(struct ieee80211_tx_info, control.rates)); BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) != offsetof(struct ieee80211_tx_info, driver_rates)); BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, status.rates) != 8); /* clear the rate counts */ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) info->status.rates[i].count = 0; memset_after(&info->status, 0, rates); } /** * enum mac80211_rx_flags - receive flags * * These flags are used with the @flag member of &struct ieee80211_rx_status. * @RX_FLAG_MMIC_ERROR: Michael MIC error was reported on this frame. * Use together with %RX_FLAG_MMIC_STRIPPED. * @RX_FLAG_DECRYPTED: This frame was decrypted in hardware. * @RX_FLAG_MMIC_STRIPPED: the Michael MIC is stripped off this frame, * verification has been done by the hardware. * @RX_FLAG_IV_STRIPPED: The IV and ICV are stripped from this frame. * If this flag is set, the stack cannot do any replay detection, * hence the driver or hardware will have to do that. * @RX_FLAG_PN_VALIDATED: Currently only valid for CCMP/GCMP frames, this * flag indicates that the PN was verified for replay protection. * Note that this flag is also currently only supported when a frame * is also decrypted (i.e. @RX_FLAG_DECRYPTED must be set) * @RX_FLAG_DUP_VALIDATED: The driver should set this flag if it did * de-duplication by itself. * @RX_FLAG_FAILED_FCS_CRC: Set this flag if the FCS check failed on * the frame. * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PLCP check failed on * the frame. * @RX_FLAG_MACTIME: The timestamp passed in the RX status (@mactime * field) is valid if this field is non-zero, and the position * where the timestamp was sampled depends on the value. * @RX_FLAG_MACTIME_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the first symbol of the MPDU * was received. This is useful in monitor mode and for proper IBSS * merging. * @RX_FLAG_MACTIME_END: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the last symbol of the MPDU * (including FCS) was received. * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the SYNC preamble was received. * @RX_FLAG_MACTIME_IS_RTAP_TS64: The timestamp passed in the RX status @mactime * is only for use in the radiotap timestamp header, not otherwise a valid * @mactime value. Note this is a separate flag so that we continue to see * %RX_FLAG_MACTIME as unset. Also note that in this case the timestamp is * reported to be 64 bits wide, not just 32. * @RX_FLAG_NO_SIGNAL_VAL: The signal strength value is not present. * Valid only for data frames (mainly A-MPDU) * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference * number (@ampdu_reference) must be populated and be a distinct number for * each A-MPDU * @RX_FLAG_AMPDU_LAST_KNOWN: last subframe is known, should be set on all * subframes of a single A-MPDU * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU * @RX_FLAG_AMPDU_DELIM_CRC_ERROR: A delimiter CRC error has been detected * on this subframe * @RX_FLAG_MIC_STRIPPED: The MIC was stripped off this packet.
Decryption was * done by the hardware * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without * processing it in any regular way. * This is useful if drivers offload some frames but still want to report * them for sniffing purposes. * @RX_FLAG_SKIP_MONITOR: Process and report frame to all interfaces except * monitor interfaces. * This is useful if drivers offload some frames but still want to report * them for sniffing purposes. * @RX_FLAG_AMSDU_MORE: Some drivers may prefer to report separate A-MSDU * subframes instead of one huge frame for performance reasons. * All but the last MSDU from an A-MSDU should have this flag set; e.g. * if an A-MSDU has 3 frames, the first 2 must have the flag set, while * the 3rd (last) one must not have this flag set. The flag is used to * deal with retransmission/duplication recovery properly since A-MSDU * subframes share the same sequence number. Reported subframes can be * either regular MSDUs or single-subframe A-MSDUs. Subframes must not be * interleaved with other frames. * @RX_FLAG_RADIOTAP_TLV_AT_END: This frame contains radiotap TLVs in the * skb->data (before the 802.11 header). * If used, the SKB's mac_header pointer must be set to point * to the 802.11 header after the TLVs, and any padding added after TLV * data to align to 4 must be cleared by the driver putting the TLVs * in the skb. * @RX_FLAG_ALLOW_SAME_PN: Allow the same PN as the previous packet. * This is used for A-MSDU subframes which can have the same PN as * the first subframe. * @RX_FLAG_ICV_STRIPPED: The ICV is stripped from this frame. CRC checking must * be done in the hardware. * @RX_FLAG_AMPDU_EOF_BIT: Value of the EOF bit in the A-MPDU delimiter for this * frame * @RX_FLAG_AMPDU_EOF_BIT_KNOWN: The EOF value is known * @RX_FLAG_RADIOTAP_HE: HE radiotap data is present * (&struct ieee80211_radiotap_he, mac80211 will fill in * * - DATA3_DATA_MCS * - DATA3_DATA_DCM * - DATA3_CODING * - DATA5_GI * - DATA5_DATA_BW_RU_ALLOC * - DATA6_NSTS * - DATA3_STBC * * from the RX info data, so leave those zeroed when building this data) * @RX_FLAG_RADIOTAP_HE_MU: HE MU radiotap data is present * (&struct ieee80211_radiotap_he_mu) * @RX_FLAG_RADIOTAP_LSIG: L-SIG radiotap data is present * @RX_FLAG_NO_PSDU: use the frame only for radiotap reporting, with * the "0-length PSDU" field included there. The value for it is * in &struct ieee80211_rx_status. Note that if this value isn't * known the frame shouldn't be reported.
* @RX_FLAG_8023: the frame has an 802.3 header (decap offload performed by * hardware or driver) */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), RX_FLAG_DECRYPTED = BIT(1), RX_FLAG_ONLY_MONITOR = BIT(2), RX_FLAG_MMIC_STRIPPED = BIT(3), RX_FLAG_IV_STRIPPED = BIT(4), RX_FLAG_FAILED_FCS_CRC = BIT(5), RX_FLAG_FAILED_PLCP_CRC = BIT(6), RX_FLAG_MACTIME_IS_RTAP_TS64 = BIT(7), RX_FLAG_NO_SIGNAL_VAL = BIT(8), RX_FLAG_AMPDU_DETAILS = BIT(9), RX_FLAG_PN_VALIDATED = BIT(10), RX_FLAG_DUP_VALIDATED = BIT(11), RX_FLAG_AMPDU_LAST_KNOWN = BIT(12), RX_FLAG_AMPDU_IS_LAST = BIT(13), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14), /* one free bit at 15 */ RX_FLAG_MACTIME = BIT(16) | BIT(17), RX_FLAG_MACTIME_PLCP_START = 1 << 16, RX_FLAG_MACTIME_START = 2 << 16, RX_FLAG_MACTIME_END = 3 << 16, RX_FLAG_SKIP_MONITOR = BIT(18), RX_FLAG_AMSDU_MORE = BIT(19), RX_FLAG_RADIOTAP_TLV_AT_END = BIT(20), RX_FLAG_MIC_STRIPPED = BIT(21), RX_FLAG_ALLOW_SAME_PN = BIT(22), RX_FLAG_ICV_STRIPPED = BIT(23), RX_FLAG_AMPDU_EOF_BIT = BIT(24), RX_FLAG_AMPDU_EOF_BIT_KNOWN = BIT(25), RX_FLAG_RADIOTAP_HE = BIT(26), RX_FLAG_RADIOTAP_HE_MU = BIT(27), RX_FLAG_RADIOTAP_LSIG = BIT(28), RX_FLAG_NO_PSDU = BIT(29), RX_FLAG_8023 = BIT(30), }; /** * enum mac80211_rx_encoding_flags - MCS & bandwidth flags * * @RX_ENC_FLAG_SHORTPRE: Short preamble was used for this frame * @RX_ENC_FLAG_SHORT_GI: Short guard interval was used * @RX_ENC_FLAG_HT_GF: This frame was received in an HT-greenfield transmission; * if the driver fills this value it should add * %IEEE80211_RADIOTAP_MCS_HAVE_FMT * to @hw.radiotap_mcs_details to advertise that fact. * @RX_ENC_FLAG_LDPC: LDPC was used * @RX_ENC_FLAG_STBC_MASK: STBC 2-bit bitmask: 1 - Nss=1, 2 - Nss=2, 3 - Nss=3 * @RX_ENC_FLAG_BF: packet was beamformed */ enum mac80211_rx_encoding_flags { RX_ENC_FLAG_SHORTPRE = BIT(0), RX_ENC_FLAG_SHORT_GI = BIT(2), RX_ENC_FLAG_HT_GF = BIT(3), RX_ENC_FLAG_STBC_MASK = BIT(4) | BIT(5), RX_ENC_FLAG_LDPC = BIT(6), RX_ENC_FLAG_BF = BIT(7), }; #define RX_ENC_FLAG_STBC_SHIFT 4 enum mac80211_rx_encoding { RX_ENC_LEGACY = 0, RX_ENC_HT, RX_ENC_VHT, RX_ENC_HE, RX_ENC_EHT, }; /** * struct ieee80211_rx_status - receive status * * The low-level driver should provide this information (the subset * supported by hardware) to the 802.11 code with each received * frame, in the skb's control buffer (cb). * * @mactime: value in microseconds of the 64-bit Time Synchronization Function * (TSF) timer when the first data symbol (MPDU) arrived at the hardware. * @boottime_ns: CLOCK_BOOTTIME timestamp the frame was received at, this is * needed only for beacons and probe responses that update the scan cache. * @ack_tx_hwtstamp: Hardware timestamp for the ack TX in nanoseconds. Only * needed for Timing measurement and Fine timing measurement action frames. * Only reported by devices that have timestamping enabled. * @device_timestamp: arbitrary timestamp for the device, mac80211 doesn't use * it but can store it and pass it back to the driver for synchronisation * @band: the active band when this frame was received * @freq: frequency the radio was tuned to when receiving this frame, in MHz * This field must be set for management frames, but isn't strictly needed * for data (other) frames - for those it only affects radiotap reporting. * @freq_offset: @freq has a positive offset of 500 kHz.
* @signal: signal strength when receiving this frame, either in dBm, in dB or * unspecified depending on the hardware capabilities flags * @IEEE80211_HW_SIGNAL_* * @chains: bitmask of receive chains for which separate signal strength * values were filled. * @chain_signal: per-chain signal strength, in dBm (unlike @signal, doesn't * support dB or unspecified units) * @antenna: antenna used * @rate_idx: index of data rate into band's supported rates or MCS index if * HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT) * @nss: number of streams (VHT, HE and EHT only) * @flag: %RX_FLAG_\* * @encoding: &enum mac80211_rx_encoding * @bw: &enum rate_info_bw * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags * @he_ru: HE RU, from &enum nl80211_he_ru_alloc * @he_gi: HE GI, from &enum nl80211_he_gi * @he_dcm: HE DCM value * @eht: EHT specific rate information * @eht.ru: EHT RU, from &enum nl80211_eht_ru_alloc * @eht.gi: EHT GI, from &enum nl80211_eht_gi * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU * @zero_length_psdu_type: radiotap type of the 0-length PSDU * @link_valid: if the link which is identified by @link_id is valid. This flag * is set only when connection is MLO. * @link_id: id of the link used to receive the packet. This is used along with * @link_valid. */ struct ieee80211_rx_status { u64 mactime; union { u64 boottime_ns; ktime_t ack_tx_hwtstamp; }; u32 device_timestamp; u32 ampdu_reference; u32 flag; u16 freq: 13, freq_offset: 1; u8 enc_flags; u8 encoding:3, bw:4; union { struct { u8 he_ru:3; u8 he_gi:2; u8 he_dcm:1; }; struct { u8 ru:4; u8 gi:2; } eht; }; u8 rate_idx; u8 nss; u8 rx_flags; u8 band; u8 antenna; s8 signal; u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; u8 zero_length_psdu_type; u8 link_valid:1, link_id:4; }; static inline u32 ieee80211_rx_status_to_khz(struct ieee80211_rx_status *rx_status) { return MHZ_TO_KHZ(rx_status->freq) + (rx_status->freq_offset ? 500 : 0); } /** * enum ieee80211_conf_flags - configuration flags * * Flags to define PHY configuration options * * @IEEE80211_CONF_MONITOR: there's a monitor interface present -- use this * to determine for example whether to calculate timestamps for packets * or not, do not use instead of filter flags! * @IEEE80211_CONF_PS: Enable 802.11 power save mode (managed mode only). * This is the power save mode defined by IEEE 802.11-2007 section 11.2, * meaning that the hardware still wakes up for beacons, is able to * transmit frames and receive the possible acknowledgment frames. * Not to be confused with hardware specific wakeup/sleep states, * driver is responsible for that. See the section "Powersave support" * for more. * @IEEE80211_CONF_IDLE: The device is running, but idle; if the flag is set * the driver should be prepared to handle configuration requests but * may turn the device off as much as possible. Typically, this flag will * be set when an interface is set UP but not associated or scanning, but * it can also be unset in that case when monitor interfaces are active. * @IEEE80211_CONF_OFFCHANNEL: The device is currently not on its main * operating channel. 
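 *
 * A driver's config callback typically reacts to the change bitmap
 * and then samples the current flags from &struct ieee80211_hw (a
 * minimal sketch; the drv_set_* helpers are hypothetical)::
 *
 *	if (changed & IEEE80211_CONF_CHANGE_PS)
 *		drv_set_powersave(priv, hw->conf.flags & IEEE80211_CONF_PS);
 *	if (changed & IEEE80211_CONF_CHANGE_IDLE)
 *		drv_set_idle(priv, hw->conf.flags & IEEE80211_CONF_IDLE);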
*/ enum ieee80211_conf_flags { IEEE80211_CONF_MONITOR = (1<<0), IEEE80211_CONF_PS = (1<<1), IEEE80211_CONF_IDLE = (1<<2), IEEE80211_CONF_OFFCHANNEL = (1<<3), }; /** * enum ieee80211_conf_changed - denotes which configuration changed * * @IEEE80211_CONF_CHANGE_LISTEN_INTERVAL: the listen interval changed * @IEEE80211_CONF_CHANGE_MONITOR: the monitor flag changed * @IEEE80211_CONF_CHANGE_PS: the PS flag or dynamic PS timeout changed * @IEEE80211_CONF_CHANGE_POWER: the TX power changed * @IEEE80211_CONF_CHANGE_CHANNEL: the channel/channel_type changed * @IEEE80211_CONF_CHANGE_RETRY_LIMITS: retry limits changed * @IEEE80211_CONF_CHANGE_IDLE: Idle flag changed * @IEEE80211_CONF_CHANGE_SMPS: Spatial multiplexing powersave mode changed * Note that this is only valid if channel contexts are not used, * otherwise each channel context has the number of chains listed. */ enum ieee80211_conf_changed { IEEE80211_CONF_CHANGE_SMPS = BIT(1), IEEE80211_CONF_CHANGE_LISTEN_INTERVAL = BIT(2), IEEE80211_CONF_CHANGE_MONITOR = BIT(3), IEEE80211_CONF_CHANGE_PS = BIT(4), IEEE80211_CONF_CHANGE_POWER = BIT(5), IEEE80211_CONF_CHANGE_CHANNEL = BIT(6), IEEE80211_CONF_CHANGE_RETRY_LIMITS = BIT(7), IEEE80211_CONF_CHANGE_IDLE = BIT(8), }; /** * enum ieee80211_smps_mode - spatial multiplexing power save mode * * @IEEE80211_SMPS_AUTOMATIC: automatic * @IEEE80211_SMPS_OFF: off * @IEEE80211_SMPS_STATIC: static * @IEEE80211_SMPS_DYNAMIC: dynamic * @IEEE80211_SMPS_NUM_MODES: internal, don't use */ enum ieee80211_smps_mode { IEEE80211_SMPS_AUTOMATIC, IEEE80211_SMPS_OFF, IEEE80211_SMPS_STATIC, IEEE80211_SMPS_DYNAMIC, /* keep last */ IEEE80211_SMPS_NUM_MODES, }; /** * struct ieee80211_conf - configuration of the device * * This struct indicates how the driver shall configure the hardware. * * @flags: configuration flags defined above * * @listen_interval: listen interval in units of beacon interval * @ps_dtim_period: The DTIM period of the AP we're connected to, for use * in power saving. Power saving will not be enabled until a beacon * has been received and the DTIM period is known. * @dynamic_ps_timeout: The dynamic powersave timeout (in ms), see the * powersave documentation below. This variable is valid only when * the CONF_PS flag is set. * * @power_level: requested transmit power (in dBm), backward compatibility * value only that is set to the minimum of all interfaces * * @chandef: the channel definition to tune to * @radar_enabled: whether radar detection is enabled * * @long_frame_max_tx_count: Maximum number of transmissions for a "long" frame * (a frame not RTS protected), called "dot11LongRetryLimit" in 802.11, * but actually means the number of transmissions not the number of retries * @short_frame_max_tx_count: Maximum number of transmissions for a "short" * frame, called "dot11ShortRetryLimit" in 802.11, but actually means the * number of transmissions not the number of retries * * @smps_mode: spatial multiplexing powersave mode; note that * %IEEE80211_SMPS_STATIC is used when the device is not * configured for an HT channel. * Note that this is only valid if channel contexts are not used, * otherwise each channel context has the number of chains listed. 
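 *
 * For example, on %IEEE80211_CONF_CHANGE_RETRY_LIMITS a driver could
 * program its hardware from these fields (sketch only; the register
 * write helper is hypothetical)::
 *
 *	drv_write_retry_limits(priv,
 *			       hw->conf.long_frame_max_tx_count,
 *			       hw->conf.short_frame_max_tx_count);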
*/ struct ieee80211_conf { u32 flags; int power_level, dynamic_ps_timeout; u16 listen_interval; u8 ps_dtim_period; u8 long_frame_max_tx_count, short_frame_max_tx_count; struct cfg80211_chan_def chandef; bool radar_enabled; enum ieee80211_smps_mode smps_mode; }; /** * struct ieee80211_channel_switch - holds the channel switch data * * The information provided in this structure is required for channel switch * operation. * * @timestamp: value in microseconds of the 64-bit Time Synchronization * Function (TSF) timer when the frame containing the channel switch * announcement was received. This is simply the rx.mactime parameter * the driver passed into mac80211. * @device_timestamp: arbitrary timestamp for the device, this is the * rx.device_timestamp parameter the driver passed to mac80211. * @block_tx: Indicates whether transmission must be blocked before the * scheduled channel switch, as indicated by the AP. * @chandef: the new channel to switch to * @count: the number of TBTTs until the channel switch event * @delay: maximum delay between the time the AP transmitted the last beacon in * the current channel and the expected time of the first beacon in the new * channel, expressed in TU. * @link_id: the link ID of the link doing the channel switch, 0 for non-MLO */ struct ieee80211_channel_switch { u64 timestamp; u32 device_timestamp; bool block_tx; struct cfg80211_chan_def chandef; u8 count; u8 link_id; u32 delay; }; /** * enum ieee80211_vif_flags - virtual interface flags * * @IEEE80211_VIF_BEACON_FILTER: the device performs beacon filtering * on this virtual interface to avoid unnecessary CPU wakeups * @IEEE80211_VIF_SUPPORTS_CQM_RSSI: the device can do connection quality * monitoring on this virtual interface -- i.e. it can monitor * connection quality related parameters, such as the RSSI level, and * provide notifications if configured trigger levels are reached. * @IEEE80211_VIF_SUPPORTS_UAPSD: The device can do U-APSD for this * interface. This flag should be set during interface addition, * but may be set/cleared as late as authentication to an AP. It is * only valid for managed/station mode interfaces. * @IEEE80211_VIF_GET_NOA_UPDATE: request to handle NOA attributes * and send P2P_PS notification to the driver if NOA changed, even * if this is not a pure P2P vif. * @IEEE80211_VIF_EML_ACTIVE: The driver indicates that EML operation is * enabled for the interface. * @IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW: Ignore wider bandwidth OFDMA * operation on this interface and request a channel context without * the AP definition. Use this e.g. because the device is able to * handle OFDMA (downlink and trigger for uplink) on a per-AP basis. * @IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC: indicates that the AP sta should * be removed only after setting the vif as unassociated, and not the * opposite. Only relevant for STA vifs. */ enum ieee80211_vif_flags { IEEE80211_VIF_BEACON_FILTER = BIT(0), IEEE80211_VIF_SUPPORTS_CQM_RSSI = BIT(1), IEEE80211_VIF_SUPPORTS_UAPSD = BIT(2), IEEE80211_VIF_GET_NOA_UPDATE = BIT(3), IEEE80211_VIF_EML_ACTIVE = BIT(4), IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW = BIT(5), IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC = BIT(6), }; /** * enum ieee80211_offload_flags - virtual interface offload flags * * @IEEE80211_OFFLOAD_ENCAP_ENABLED: tx encapsulation offload is enabled * The driver supports sending frames passed as 802.3 frames by mac80211. * It must also support sending 802.11 packets for the same interface.
* @IEEE80211_OFFLOAD_ENCAP_4ADDR: support 4-address mode encapsulation offload * @IEEE80211_OFFLOAD_DECAP_ENABLED: rx encapsulation offload is enabled * The driver supports passing received 802.11 frames as 802.3 frames to * mac80211. */ enum ieee80211_offload_flags { IEEE80211_OFFLOAD_ENCAP_ENABLED = BIT(0), IEEE80211_OFFLOAD_ENCAP_4ADDR = BIT(1), IEEE80211_OFFLOAD_DECAP_ENABLED = BIT(2), }; /** * struct ieee80211_vif_cfg - interface configuration * @assoc: association status * @ibss_joined: indicates whether this station is part of an IBSS or not * @ibss_creator: indicates if a new IBSS network is being created * @ps: power-save mode (STA only). This flag is NOT affected by * offchannel/dynamic_ps operations. * @aid: association ID number, valid only when @assoc is true * @eml_cap: EML capabilities as described in P802.11be_D4.1 Figure 9-1001j. * @eml_med_sync_delay: Medium Synchronization delay as described in * P802.11be_D4.1 Figure 9-1001i. * @mld_capa_op: MLD Capabilities and Operations per P802.11be_D4.1 * Figure 9-1001k * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The * hardware may filter ARP queries targeted for addresses other than * those listed here. The driver must allow ARP queries targeted for * all addresses listed here to pass through. An empty list implies no * ARP queries need to pass. * @arp_addr_cnt: Number of addresses currently on the list. Note that this * may be larger than %IEEE80211_BSS_ARP_ADDR_LIST_LEN (the arp_addr_list * array size), it's up to the driver what to do in that case. * @ssid: The SSID of the current vif. Valid in AP and IBSS mode. * @ssid_len: Length of SSID given in @ssid. * @s1g: BSS is an S1G BSS (affects Association Request format). * @idle: This interface is idle. There's also a global idle flag in the * hardware config which may be more appropriate depending on what * your driver/device needs to do. * @ap_addr: AP MLD address, or BSSID for non-MLO connections * (station mode only) */ struct ieee80211_vif_cfg { /* association related data */ bool assoc, ibss_joined; bool ibss_creator; bool ps; u16 aid; u16 eml_cap; u16 eml_med_sync_delay; u16 mld_capa_op; __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; int arp_addr_cnt; u8 ssid[IEEE80211_MAX_SSID_LEN]; size_t ssid_len; bool s1g; bool idle; u8 ap_addr[ETH_ALEN] __aligned(2); }; #define IEEE80211_TTLM_NUM_TIDS 8 /** * struct ieee80211_neg_ttlm - negotiated TID to link map info * * @downlink: bitmap of active links per TID for downlink, or 0 if mapping for * this TID is not included. * @uplink: bitmap of active links per TID for uplink, or 0 if mapping for this * TID is not included. * @valid: info is valid or not. */ struct ieee80211_neg_ttlm { u16 downlink[IEEE80211_TTLM_NUM_TIDS]; u16 uplink[IEEE80211_TTLM_NUM_TIDS]; bool valid; }; /** * enum ieee80211_neg_ttlm_res - return value for negotiated TTLM handling * @NEG_TTLM_RES_ACCEPT: accept the request * @NEG_TTLM_RES_REJECT: reject the request * @NEG_TTLM_RES_SUGGEST_PREFERRED: reject and suggest a new mapping */ enum ieee80211_neg_ttlm_res { NEG_TTLM_RES_ACCEPT, NEG_TTLM_RES_REJECT, NEG_TTLM_RES_SUGGEST_PREFERRED }; /** * struct ieee80211_vif - per-interface data * * Data in this structure is continually present for driver * use during the life of a virtual interface.
* * @type: type of this virtual interface * @cfg: vif configuration, see &struct ieee80211_vif_cfg * @bss_conf: BSS configuration for this interface, either our own * or the BSS we're associated to * @link_conf: in case of MLD, the per-link BSS configuration, * indexed by link ID * @valid_links: bitmap of valid links, or 0 for non-MLO. * @active_links: The bitmap of active links, or 0 for non-MLO. * The driver shouldn't change this directly, but use the * API calls meant for that purpose. * @dormant_links: subset of the valid links that are disabled/suspended * due to advertised or negotiated TTLM respectively. * 0 for non-MLO. * @suspended_links: subset of dormant_links representing links that are * suspended due to negotiated TTLM, and could be activated in the * future by tearing down the TTLM negotiation. * 0 for non-MLO. * @neg_ttlm: negotiated TID to link mapping info. * see &struct ieee80211_neg_ttlm. * @addr: address of this interface * @addr_valid: indicates if the address is actively used. Set to false for * passive monitor interfaces, true in all other cases. * @p2p: indicates whether this AP or STA interface is a p2p * interface, i.e. a GO or p2p-sta respectively * @netdev_features: tx netdev features supported by the hardware for this * vif. mac80211 initializes this to hw->netdev_features, and the driver * can mask out specific tx features. mac80211 will handle software fixup * for masked offloads (GSO, CSUM) * @driver_flags: flags/capabilities the driver has for this interface, * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed * at runtime, mac80211 will never touch this field * @offload_flags: 802.3 -> 802.11 encapsulation offload capabilities/flags * for this interface, see &enum ieee80211_offload_flags. * These are initialized by mac80211 before calling .add_interface, * .change_interface or .update_vif_offload and updated by the driver * within these ops, based on supported features or runtime change * restrictions. * @hw_queue: hardware queue for each AC * @cab_queue: content-after-beacon (DTIM beacon really) queue, AP mode only * @debugfs_dir: debugfs dentry, can be used by drivers to create their own * per-interface debug files. Note that it will be NULL for the virtual * monitor interface (if that is requested.) * @probe_req_reg: probe requests should be reported to mac80211 for this * interface. * @rx_mcast_action_reg: multicast Action frames should be reported to mac80211 * for this interface. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*). * @txq: the multicast data TX queue * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled.
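 *
 * Drivers typically keep their per-interface state in @drv_priv, whose
 * size they declare in &struct ieee80211_hw's @vif_data_size before
 * registering (a minimal sketch; "drv_vif" and drv_alloc_slot() are
 * hypothetical driver-side names)::
 *
 *	struct drv_vif {
 *		u8 hw_slot;
 *	};
 *
 *	static int drv_add_interface(struct ieee80211_hw *hw,
 *				     struct ieee80211_vif *vif)
 *	{
 *		struct drv_vif *dvif = (void *)vif->drv_priv;
 *
 *		dvif->hw_slot = drv_alloc_slot(hw->priv);
 *		return 0;
 *	}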
*/ struct ieee80211_vif { enum nl80211_iftype type; struct ieee80211_vif_cfg cfg; struct ieee80211_bss_conf bss_conf; struct ieee80211_bss_conf __rcu *link_conf[IEEE80211_MLD_MAX_NUM_LINKS]; u16 valid_links, active_links, dormant_links, suspended_links; struct ieee80211_neg_ttlm neg_ttlm; u8 addr[ETH_ALEN] __aligned(2); bool addr_valid; bool p2p; u8 cab_queue; u8 hw_queue[IEEE80211_NUM_ACS]; struct ieee80211_txq *txq; netdev_features_t netdev_features; u32 driver_flags; u32 offload_flags; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; #endif bool probe_req_reg; bool rx_mcast_action_reg; struct ieee80211_vif *mbssid_tx_vif; /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; /** * ieee80211_vif_usable_links - Return the usable links for the vif * @vif: the vif for which the usable links are requested * Return: the usable link bitmap */ static inline u16 ieee80211_vif_usable_links(const struct ieee80211_vif *vif) { return vif->valid_links & ~vif->dormant_links; } /** * ieee80211_vif_is_mld - Returns true iff the vif is an MLD one * @vif: the vif * Return: %true if the vif is an MLD, %false otherwise. */ static inline bool ieee80211_vif_is_mld(const struct ieee80211_vif *vif) { /* valid_links != 0 indicates this vif is an MLD */ return vif->valid_links != 0; } /** * ieee80211_vif_link_active - check if a given link is active * @vif: the vif * @link_id: the link ID to check * Return: %true if the vif is an MLD and the link is active, or if * the vif is not an MLD and the link ID is 0; %false otherwise. */ static inline bool ieee80211_vif_link_active(const struct ieee80211_vif *vif, unsigned int link_id) { if (!ieee80211_vif_is_mld(vif)) return link_id == 0; return vif->active_links & BIT(link_id); } #define for_each_vif_active_link(vif, link, link_id) \ for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++) \ if ((!(vif)->active_links || \ (vif)->active_links & BIT(link_id)) && \ (link = link_conf_dereference_check(vif, link_id))) static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) { #ifdef CONFIG_MAC80211_MESH return vif->type == NL80211_IFTYPE_MESH_POINT; #endif return false; } /** * wdev_to_ieee80211_vif - return a vif struct from a wdev * @wdev: the wdev to get the vif for * * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that get a wdev. * * Return: pointer to the vif, or %NULL if the given wdev isn't * associated with a vif that the driver knows about (e.g. monitor * or AP_VLAN interfaces.) */ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); /** * ieee80211_vif_to_wdev - return a wdev struct from a vif * @vif: the vif to get the wdev for * * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that need to get the wdev for a vif. * This can also be useful to get the netdev associated with a vif.
* * Return: pointer to the wdev */ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); static inline bool lockdep_vif_wiphy_mutex_held(struct ieee80211_vif *vif) { return lockdep_is_held(&ieee80211_vif_to_wdev(vif)->wiphy->mtx); } #define link_conf_dereference_protected(vif, link_id) \ rcu_dereference_protected((vif)->link_conf[link_id], \ lockdep_vif_wiphy_mutex_held(vif)) #define link_conf_dereference_check(vif, link_id) \ rcu_dereference_check((vif)->link_conf[link_id], \ lockdep_vif_wiphy_mutex_held(vif)) /** * enum ieee80211_key_flags - key flags * * These flags are used for communication about keys between the driver * and mac80211, with the @flags parameter of &struct ieee80211_key_conf. * * @IEEE80211_KEY_FLAG_GENERATE_IV: This flag should be set by the * driver to indicate that it requires IV generation for this * particular key. Setting this flag does not necessarily mean that SKBs * will have sufficient tailroom for ICV or MIC. * @IEEE80211_KEY_FLAG_GENERATE_MMIC: This flag should be set by * the driver for a TKIP key if it requires Michael MIC * generation in software. * @IEEE80211_KEY_FLAG_PAIRWISE: Set by mac80211, this flag indicates * that the key is pairwise rather than a shared key. * @IEEE80211_KEY_FLAG_SW_MGMT_TX: This flag should be set by the driver for a * CCMP/GCMP key if it requires CCMP/GCMP encryption of management frames * (MFP) to be done in software. * @IEEE80211_KEY_FLAG_PUT_IV_SPACE: This flag should be set by the driver * if space should be prepared for the IV, but the IV * itself should not be generated. Do not set together with * @IEEE80211_KEY_FLAG_GENERATE_IV on the same key. Setting this flag does * not necessarily mean that SKBs will have sufficient tailroom for ICV or * MIC. * @IEEE80211_KEY_FLAG_RX_MGMT: This key will be used to decrypt received * management frames. The flag can help drivers that have a hardware * crypto implementation that doesn't deal with management frames * properly by allowing them to not upload the keys to hardware and * fall back to software crypto. Note that this flag deals only with * RX; if your crypto engine can't deal with TX you can also set the * %IEEE80211_KEY_FLAG_SW_MGMT_TX flag to encrypt such frames in SW. * @IEEE80211_KEY_FLAG_GENERATE_IV_MGMT: This flag should be set by the * driver for a CCMP/GCMP key to indicate that it requires IV generation * only for management frames (MFP). * @IEEE80211_KEY_FLAG_RESERVE_TAILROOM: This flag should be set by the * driver for a key to indicate that sufficient tailroom must always * be reserved for ICV or MIC, even when HW encryption is enabled. * @IEEE80211_KEY_FLAG_PUT_MIC_SPACE: This flag should be set by the driver for * a TKIP key if it only requires MIC space. Do not set together with * @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key. * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation.
* @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver * for an AES_CMAC or an AES_GMAC key to indicate that it requires sequence * number generation only * @IEEE80211_KEY_FLAG_SPP_AMSDU: SPP A-MSDUs can be used with this key * (set by mac80211 from the sta->spp_amsdu flag) */ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_GENERATE_IV_MGMT = BIT(0), IEEE80211_KEY_FLAG_GENERATE_IV = BIT(1), IEEE80211_KEY_FLAG_GENERATE_MMIC = BIT(2), IEEE80211_KEY_FLAG_PAIRWISE = BIT(3), IEEE80211_KEY_FLAG_SW_MGMT_TX = BIT(4), IEEE80211_KEY_FLAG_PUT_IV_SPACE = BIT(5), IEEE80211_KEY_FLAG_RX_MGMT = BIT(6), IEEE80211_KEY_FLAG_RESERVE_TAILROOM = BIT(7), IEEE80211_KEY_FLAG_PUT_MIC_SPACE = BIT(8), IEEE80211_KEY_FLAG_NO_AUTO_TX = BIT(9), IEEE80211_KEY_FLAG_GENERATE_MMIE = BIT(10), IEEE80211_KEY_FLAG_SPP_AMSDU = BIT(11), }; /** * struct ieee80211_key_conf - key information * * This key information is given by mac80211 to the driver by * the set_key() callback in &struct ieee80211_ops. * * @hw_key_idx: To be set by the driver, this is the key index the driver * wants to be given when a frame is transmitted and needs to be * encrypted in hardware. * @cipher: The key's cipher suite selector. * @tx_pn: PN used for TX keys, may be used by the driver as well if it * needs to do software PN assignment by itself (e.g. due to TSO) * @flags: key flags, see &enum ieee80211_key_flags. * @keyidx: the key index (0-7) * @keylen: key material length * @key: key material. For ALG_TKIP the key is encoded as a 256-bit (32 byte) * data block: * - Temporal Encryption Key (128 bits) * - Temporal Authenticator Tx MIC Key (64 bits) * - Temporal Authenticator Rx MIC Key (64 bits) * @icv_len: The ICV length for this key type * @iv_len: The IV length for this key type * @link_id: the link ID, 0 for non-MLO, or -1 for pairwise keys */ struct ieee80211_key_conf { atomic64_t tx_pn; u32 cipher; u8 icv_len; u8 iv_len; u8 hw_key_idx; s8 keyidx; u16 flags; s8 link_id; u8 keylen; u8 key[]; }; #define IEEE80211_MAX_PN_LEN 16 #define TKIP_PN_TO_IV16(pn) ((u16)(pn & 0xffff)) #define TKIP_PN_TO_IV32(pn) ((u32)((pn >> 16) & 0xffffffff)) /** * struct ieee80211_key_seq - key sequence counter * * @tkip: TKIP data, containing IV32 and IV16 in host byte order * @ccmp: PN data, most significant byte first (big endian, * reverse of the order in the packet) * @aes_cmac: PN data, most significant byte first (big endian, * reverse of the order in the packet) * @aes_gmac: PN data, most significant byte first (big endian, * reverse of the order in the packet) * @gcmp: PN data, most significant byte first (big endian, * reverse of the order in the packet) * @hw: data for HW-only (e.g. cipher scheme) keys */ struct ieee80211_key_seq { union { struct { u32 iv32; u16 iv16; } tkip; struct { u8 pn[6]; } ccmp; struct { u8 pn[6]; } aes_cmac; struct { u8 pn[6]; } aes_gmac; struct { u8 pn[6]; } gcmp; struct { u8 seq[IEEE80211_MAX_PN_LEN]; u8 seq_len; } hw; }; }; /** * enum set_key_cmd - key command * * Used with the set_key() callback in &struct ieee80211_ops, this * indicates whether a key is being removed or added.
 *
 * @SET_KEY: a key is set
 * @DISABLE_KEY: a key must be disabled
 */
enum set_key_cmd {
	SET_KEY, DISABLE_KEY,
};

/**
 * enum ieee80211_sta_state - station state
 *
 * @IEEE80211_STA_NOTEXIST: station doesn't exist at all,
 *	this is a special state for add/remove transitions
 * @IEEE80211_STA_NONE: station exists without special state
 * @IEEE80211_STA_AUTH: station is authenticated
 * @IEEE80211_STA_ASSOC: station is associated
 * @IEEE80211_STA_AUTHORIZED: station is authorized (802.1X)
 */
enum ieee80211_sta_state {
	/* NOTE: These need to be ordered correctly! */
	IEEE80211_STA_NOTEXIST,
	IEEE80211_STA_NONE,
	IEEE80211_STA_AUTH,
	IEEE80211_STA_ASSOC,
	IEEE80211_STA_AUTHORIZED,
};

/**
 * enum ieee80211_sta_rx_bandwidth - station RX bandwidth
 * @IEEE80211_STA_RX_BW_20: station can only receive 20 MHz
 * @IEEE80211_STA_RX_BW_40: station can receive up to 40 MHz
 * @IEEE80211_STA_RX_BW_80: station can receive up to 80 MHz
 * @IEEE80211_STA_RX_BW_160: station can receive up to 160 MHz
 *	(including 80+80 MHz)
 * @IEEE80211_STA_RX_BW_320: station can receive up to 320 MHz
 *
 * Implementation note: 20 must be zero so the field is zero-initialized
 * correctly, and the values must be sorted.
 */
enum ieee80211_sta_rx_bandwidth {
	IEEE80211_STA_RX_BW_20 = 0,
	IEEE80211_STA_RX_BW_40,
	IEEE80211_STA_RX_BW_80,
	IEEE80211_STA_RX_BW_160,
	IEEE80211_STA_RX_BW_320,
};

#define IEEE80211_STA_RX_BW_MAX	IEEE80211_STA_RX_BW_320

/**
 * struct ieee80211_sta_rates - station rate selection table
 *
 * @rcu_head: RCU head used for freeing the table on update
 * @rate: transmit rates/flags to be used by default.
 *	Overriding entries per-packet is possible by using cb tx control.
 */
struct ieee80211_sta_rates {
	struct rcu_head rcu_head;
	struct {
		s8 idx;
		u8 count;
		u8 count_cts;
		u8 count_rts;
		u16 flags;
	} rate[IEEE80211_TX_RATE_TABLE_SIZE];
};

/**
 * struct ieee80211_sta_txpwr - station txpower configuration
 *
 * Used to configure the transmit power for a station.
 *
 * @power: indicates the tx power, in dBm, to be used when sending data frames
 *	to the STA.
 * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power
 *	will be less than or equal to the value specified from userspace,
 *	whereas if TPC %type is NL80211_TX_POWER_AUTOMATIC then it indicates
 *	the default tx power. NL80211_TX_POWER_FIXED is not a valid
 *	configuration option for per peer TPC.
 */
struct ieee80211_sta_txpwr {
	s16 power;
	enum nl80211_tx_power_setting type;
};

/**
 * struct ieee80211_sta_aggregates - info that is aggregated from active links
 *
 * Used for any per-link data that needs to be aggregated and updated in the
 * main &struct ieee80211_sta when updated or the active links change.
 *
 * @max_amsdu_len: indicates the maximal length of an A-MSDU in bytes.
 *	This field is always valid for packets with a VHT preamble.
 *	For packets with a HT preamble, additional limits apply:
 *
 *	* If the skb is transmitted as part of a BA agreement, the
 *	  A-MSDU maximal size is min(max_amsdu_len, 4065) bytes.
 *	* If the skb is not part of a BA agreement, the A-MSDU maximal
 *	  size is min(max_amsdu_len, 7935) bytes.
 *
 *	Both additional HT limits must be enforced by the low level
 *	driver. This is defined by the spec (IEEE 802.11-2012 section
 *	8.3.2.2 NOTE 2).
 * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control.
 * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID
 */
struct ieee80211_sta_aggregates {
	u16 max_amsdu_len;
	u16 max_rc_amsdu_len;
	u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS];
};

/**
 * struct ieee80211_link_sta - station Link specific info
 * All link-specific info for a STA link, for a non-MLD STA (single entry)
 * or an MLD STA (multiple entries), is stored here.
 *
 * @sta: reference to owning STA
 * @addr: MAC address of the Link STA. For a non-MLO STA this is the same as
 *	the addr in ieee80211_sta. For an MLO link STA this addr can be the
 *	same as or different from the addr in ieee80211_sta (which represents
 *	the MLD STA addr)
 * @link_id: the link ID for this link STA (0 for deflink)
 * @smps_mode: current SMPS mode (off, static or dynamic)
 * @supp_rates: Bitmap of supported rates
 * @ht_cap: HT capabilities of this STA; restricted to our own capabilities
 * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities
 * @he_cap: HE capabilities of this STA
 * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities
 * @eht_cap: EHT capabilities of this STA
 * @agg: per-link data for multi-link aggregation
 * @bandwidth: current bandwidth the station can receive with
 * @rx_nss: in HT/VHT, the maximum number of spatial streams the
 *	station can receive at the moment, changed by operating mode
 *	notifications and capabilities. The value is only valid after
 *	the station moves to associated state.
 * @txpwr: the station tx power configuration
 *
 */
struct ieee80211_link_sta {
	struct ieee80211_sta *sta;

	u8 addr[ETH_ALEN];
	u8 link_id;
	enum ieee80211_smps_mode smps_mode;

	u32 supp_rates[NUM_NL80211_BANDS];
	struct ieee80211_sta_ht_cap ht_cap;
	struct ieee80211_sta_vht_cap vht_cap;
	struct ieee80211_sta_he_cap he_cap;
	struct ieee80211_he_6ghz_capa he_6ghz_capa;
	struct ieee80211_sta_eht_cap eht_cap;

	struct ieee80211_sta_aggregates agg;

	u8 rx_nss;
	enum ieee80211_sta_rx_bandwidth bandwidth;
	struct ieee80211_sta_txpwr txpwr;
};

/**
 * struct ieee80211_sta - station table entry
 *
 * A station table entry represents a station we are possibly
 * communicating with. Since stations are RCU-managed in
 * mac80211, any ieee80211_sta pointer you get access to must
 * either be protected by rcu_read_lock() explicitly or implicitly,
 * or you must take good care to not use such a pointer after a
 * call to your sta_remove callback that removed it.
 * This also represents the MLD STA in case of MLO association
 * and holds pointers to the various link STAs.
 *
 * @addr: MAC address
 * @aid: AID we assigned to the station if we're an AP
 * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU
 *	that this station is allowed to transmit to us.
 *	Can be modified by driver.
 * @wme: indicates whether the STA supports QoS/WME (if the local device does,
 *	otherwise always false)
 * @drv_priv: data area for driver use, will always be aligned to
 *	sizeof(void \*), size is determined in hw information.
 * @uapsd_queues: bitmap of queues configured for uapsd. Only valid
 *	if wme is supported. The bits order is like in
 *	IEEE80211_WMM_IE_STA_QOSINFO_AC_*.
 * @max_sp: max Service Period. Only valid if wme is supported.
 * @rates: rate control selection table
 * @tdls: indicates whether the STA is a TDLS peer
 * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only
 *	valid if the STA is a TDLS peer in the first place.
 * @mfp: indicates whether the STA uses management frame protection or not.
 * @mlo: indicates whether the STA is an MLO station.
 * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single
 *	A-MSDU.
 *	Taken from the Extended Capabilities element. 0 means
 *	unlimited.
 * @cur: currently valid data as aggregated from the active links.
 *	For a non-MLO STA it will point to the deflink data. For an MLO STA,
 *	ieee80211_sta_recalc_aggregates() must be called to update it.
 * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not.
 * @txq: per-TID data TX queues; note that the last entry (%IEEE80211_NUM_TIDS)
 *	is used for non-data frames
 * @deflink: This holds the default link STA information. For a non-MLO STA,
 *	all link-specific STA information is accessed through @deflink or
 *	through link[0], which points to the address of @deflink. For an MLO
 *	link STA, the first added link STA will point to @deflink.
 * @link: references to link STA entries. For a non-MLO STA, all entries
 *	except the first, i.e. link[0], are NULL by default, and link
 *	information is accessed via @deflink or link[0]. For an MLO STA,
 *	the first link STA added points its link pointer to the @deflink
 *	address, and the remaining ones are allocated and assigned to
 *	link[link_id], where link_id is the ID assigned by the AP.
 * @valid_links: bitmap of valid links, or 0 for non-MLO
 * @spp_amsdu: indicates whether the STA uses SPP A-MSDU or not.
 */
struct ieee80211_sta {
	u8 addr[ETH_ALEN] __aligned(2);
	u16 aid;
	u16 max_rx_aggregation_subframes;
	bool wme;
	u8 uapsd_queues;
	u8 max_sp;
	struct ieee80211_sta_rates __rcu *rates;
	bool tdls;
	bool tdls_initiator;
	bool mfp;
	bool mlo;
	bool spp_amsdu;
	u8 max_amsdu_subframes;

	struct ieee80211_sta_aggregates *cur;

	bool support_p2p_ps;

	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1];

	u16 valid_links;
	struct ieee80211_link_sta deflink;
	struct ieee80211_link_sta __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS];

	/* must be last */
	u8 drv_priv[] __aligned(sizeof(void *));
};

#ifdef CONFIG_LOCKDEP
bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta);
#else
static inline bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta)
{
	return true;
}
#endif

#define link_sta_dereference_protected(sta, link_id)		\
	rcu_dereference_protected((sta)->link[link_id],		\
				  lockdep_sta_mutex_held(sta))

#define link_sta_dereference_check(sta, link_id)		\
	rcu_dereference_check((sta)->link[link_id],		\
			      lockdep_sta_mutex_held(sta))

#define for_each_sta_active_link(vif, sta, link_sta, link_id)		\
	for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++)	\
		if ((!(vif)->active_links ||				\
		     (vif)->active_links & BIT(link_id)) &&		\
		    ((link_sta) = link_sta_dereference_check(sta, link_id)))

/**
 * enum sta_notify_cmd - sta notify command
 *
 * Used with the sta_notify() callback in &struct ieee80211_ops, this
 * indicates if an associated station made a power state transition.
 *
 * @STA_NOTIFY_SLEEP: a station is now sleeping
 * @STA_NOTIFY_AWAKE: a sleeping station woke up
 */
enum sta_notify_cmd {
	STA_NOTIFY_SLEEP, STA_NOTIFY_AWAKE,
};

/**
 * struct ieee80211_tx_control - TX control data
 *
 * @sta: station table entry; this sta pointer may be NULL and
 *	it is not allowed to copy the pointer, due to RCU.
 */
struct ieee80211_tx_control {
	struct ieee80211_sta *sta;
};

/**
 * struct ieee80211_txq - Software intermediate tx queue
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @sta: station table entry, %NULL for per-vif queue
 * @tid: the TID for this queue (unused for per-vif queue),
 *	%IEEE80211_NUM_TIDS for non-data (if enabled)
 * @ac: the AC for this queue
 * @drv_priv: driver private area, sized by hw->txq_data_size
 *
 * The driver can obtain packets from this queue by calling
 * ieee80211_tx_dequeue().
 */
struct ieee80211_txq {
	struct ieee80211_vif *vif;
	struct ieee80211_sta *sta;
	u8 tid;
	u8 ac;

	/* must be last */
	u8 drv_priv[] __aligned(sizeof(void *));
};

/**
 * enum ieee80211_hw_flags - hardware flags
 *
 * These flags are used to indicate hardware capabilities to
 * the stack. Generally, flags here should be defined so that
 * the simplest hardware doesn't need to set any particular
 * flags. There are some exceptions to this rule, however, so
 * you are advised to review these flags carefully.
 *
 * @IEEE80211_HW_HAS_RATE_CONTROL:
 *	The hardware or firmware includes rate control, and cannot be
 *	controlled by the stack. As such, no rate control algorithm
 *	should be instantiated, and the TX rate reported to userspace
 *	will be taken from the TX status instead of the rate control
 *	algorithm.
 *	Note that this requires that the driver implement a number of
 *	callbacks so it has the correct information: it needs to have
 *	the @set_rts_threshold callback and must look at the BSS config
 *	@use_cts_prot for G/N protection, @use_short_slot for slot
 *	timing in 2.4 GHz and @use_short_preamble for preambles for
 *	CCK frames.
 *
 * @IEEE80211_HW_RX_INCLUDES_FCS:
 *	Indicates that received frames passed to the stack include
 *	the FCS at the end.
 *
 * @IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING:
 *	Some wireless LAN chipsets buffer broadcast/multicast frames
 *	for power saving stations in the hardware/firmware and others
 *	rely on the host system for such buffering. This option is used
 *	to configure the IEEE 802.11 upper layer to buffer broadcast and
 *	multicast frames when there are power saving stations so that
 *	the driver can fetch them with ieee80211_get_buffered_bc().
 *
 * @IEEE80211_HW_SIGNAL_UNSPEC:
 *	Hardware can provide signal values but we don't know their units. We
 *	expect values between 0 and @max_signal.
 *	If possible please provide dB or dBm instead.
 *
 * @IEEE80211_HW_SIGNAL_DBM:
 *	Hardware gives signal values in dBm, decibel difference from
 *	one milliwatt. This is the preferred method since it is standardized
 *	between different devices. @max_signal does not need to be set.
 *
 * @IEEE80211_HW_SPECTRUM_MGMT:
 *	Hardware supports spectrum management defined in 802.11h
 *	Measurement, Channel Switch, Quieting, TPC
 *
 * @IEEE80211_HW_AMPDU_AGGREGATION:
 *	Hardware supports 11n A-MPDU aggregation.
 *
 * @IEEE80211_HW_SUPPORTS_PS:
 *	Hardware has power save support (i.e. can go to sleep).
 *
 * @IEEE80211_HW_PS_NULLFUNC_STACK:
 *	Hardware requires nullfunc frame handling in stack, implies
 *	stack support for dynamic PS.
 *
 * @IEEE80211_HW_SUPPORTS_DYNAMIC_PS:
 *	Hardware has support for dynamic PS.
 *
 * @IEEE80211_HW_MFP_CAPABLE:
 *	Hardware supports management frame protection (MFP, IEEE 802.11w).
 *
 * @IEEE80211_HW_REPORTS_TX_ACK_STATUS:
 *	Hardware can provide ack status reports of Tx frames to
 *	the stack.
 *
 * @IEEE80211_HW_CONNECTION_MONITOR:
 *	The hardware performs its own connection monitoring, including
 *	periodic keep-alives to the AP and probing the AP on beacon loss.
 *
 * @IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC:
 *	This device needs to get data from beacon before association (i.e.
 *	dtim_period).
 *
 * @IEEE80211_HW_SUPPORTS_PER_STA_GTK: The device's crypto engine supports
 *	per-station GTKs as used by IBSS RSN or during fast transition. If
 *	the device doesn't support per-station GTKs, but can be asked not
 *	to decrypt group addressed frames, then IBSS RSN support is still
 *	possible but software crypto will be used. Advertise the wiphy flag
 *	only in that case.
 *
 * @IEEE80211_HW_AP_LINK_PS: When operating in AP mode the device
 *	autonomously manages the PS status of connected stations. When
 *	this flag is set mac80211 will not trigger PS mode for connected
 *	stations based on the PM bit of incoming frames.
 *	Use ieee80211_start_ps()/ieee80211_end_ps() to manually configure
 *	the PS mode of connected stations.
 *
 * @IEEE80211_HW_TX_AMPDU_SETUP_IN_HW: The device handles TX A-MPDU session
 *	setup strictly in HW. mac80211 should not attempt to do this in
 *	software.
 *
 * @IEEE80211_HW_WANT_MONITOR_VIF: The driver would like to be informed of
 *	a virtual monitor interface when monitor interfaces are the only
 *	active interfaces.
 *
 * @IEEE80211_HW_NO_VIRTUAL_MONITOR: The driver would like to be informed
 *	of any monitor interface, as well as their configured channel.
 *	This is useful for supporting multiple monitor interfaces on different
 *	channels.
 *
 * @IEEE80211_HW_NO_AUTO_VIF: The driver would like for no wlanX to
 *	be created. It is expected user-space will create vifs as
 *	desired (and thus have them named as desired).
 *
 * @IEEE80211_HW_SW_CRYPTO_CONTROL: The driver wants to control which of the
 *	crypto algorithms can be done in software - so don't automatically
 *	try to fall back to it if hardware crypto fails, but do so only if
 *	the driver returns 1. This also forces the driver to advertise its
 *	supported cipher suites.
 *
 * @IEEE80211_HW_SUPPORT_FAST_XMIT: The driver/hardware supports fast-xmit,
 *	this currently requires only the ability to calculate the duration
 *	for frames.
 *
 * @IEEE80211_HW_QUEUE_CONTROL: The driver wants to control per-interface
 *	queue mapping in order to use different queues (not just one per AC)
 *	for different virtual interfaces. See the doc section on HW queue
 *	control for more details.
 *
 * @IEEE80211_HW_SUPPORTS_RC_TABLE: The driver supports using a rate
 *	selection table provided by the rate control algorithm.
 *
 * @IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF: Use the P2P Device address for any
 *	P2P Interface. This will be honoured even if more than one interface
 *	is supported.
 *
 * @IEEE80211_HW_TIMING_BEACON_ONLY: Use sync timing from beacon frames
 *	only, to allow getting TBTT of a DTIM beacon.
 *
 * @IEEE80211_HW_SUPPORTS_HT_CCK_RATES: Hardware supports mixing HT/CCK rates
 *	and can cope with CCK rates in an aggregation session (e.g. by not
 *	using aggregation for such frames.)
 *
 * @IEEE80211_HW_CHANCTX_STA_CSA: Support 802.11h based channel-switch (CSA)
 *	for a single active channel while using channel contexts. When support
 *	is not enabled the default action is to disconnect when getting the
 *	CSA frame.
 *
 * @IEEE80211_HW_SUPPORTS_CLONED_SKBS: The driver will never modify the payload
 *	or tailroom of TX skbs without copying them first.
 *
 * @IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS: The HW supports scanning on all bands
 *	in one command, mac80211 doesn't have to run separate scans per band.
 *
 * @IEEE80211_HW_TDLS_WIDER_BW: The device/driver supports wider bandwidth
 *	than the BSS bandwidth for a TDLS link on the base channel.
 *
 * @IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU: The driver supports receiving A-MSDUs
 *	within A-MPDU.
 *
 * @IEEE80211_HW_BEACON_TX_STATUS: The device/driver provides TX status
 *	for sent beacons.
 *
 * @IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR: Hardware (or driver) requires that each
 *	station has a unique address, i.e. each station entry can be identified
 *	by just its MAC address; this prevents, for example, the same station
 *	from connecting to two virtual AP interfaces at the same time.
 *
 * @IEEE80211_HW_SUPPORTS_REORDERING_BUFFER: Hardware (or driver) manages the
 *	reordering buffer internally, guaranteeing mac80211 receives frames in
 *	order and does not need to manage its own reorder buffer or BA session
 *	timeout.
 *
 * @IEEE80211_HW_USES_RSS: The device uses RSS and thus requires parallel RX,
 *	which implies using per-CPU station statistics.
 *
 * @IEEE80211_HW_TX_AMSDU: Hardware (or driver) supports software aggregated
 *	A-MSDU frames. Requires software tx queueing and fast-xmit support.
 *	When not using minstrel/minstrel_ht rate control, the driver must
 *	limit the maximum A-MSDU size based on the current tx rate by setting
 *	max_rc_amsdu_len in struct ieee80211_sta.
 *
 * @IEEE80211_HW_TX_FRAG_LIST: Hardware (or driver) supports sending frag_list
 *	skbs, needed for zero-copy software A-MSDU.
 *
 * @IEEE80211_HW_REPORTS_LOW_ACK: The driver (or firmware) reports low ack
 *	events via ieee80211_report_low_ack() based on its own algorithm. For
 *	such drivers, the mac80211 packet loss mechanism will not be triggered
 *	and the driver is completely dependent on firmware events for station
 *	kickout.
 *
 * @IEEE80211_HW_SUPPORTS_TX_FRAG: Hardware does fragmentation by itself.
 *	The stack will not do fragmentation.
 *	The callback for @set_frag_threshold should be set as well.
 *
 * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on
 *	TDLS links.
 *
 * @IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP: The driver (or firmware) doesn't
 *	support QoS NDP for AP probing - that's most likely a driver bug.
 *
 * @IEEE80211_HW_BUFF_MMPDU_TXQ: use the TXQ for bufferable MMPDUs, this of
 *	course requires the driver to use TXQs to start with.
 *
 * @IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW: (Hardware) rate control supports VHT
 *	extended NSS BW (dot11VHTExtendedNSSBWCapable). This flag will be set if
 *	the selected rate control algorithm sets %RATE_CTRL_CAPA_VHT_EXT_NSS_BW
 *	but if the rate control is built-in then it must be set by the driver.
 *	See also the documentation for that flag.
 *
 * @IEEE80211_HW_STA_MMPDU_TXQ: use the extra non-TID per-station TXQ for all
 *	MMPDUs on station interfaces. This of course requires the driver to use
 *	TXQs to start with.
 *
 * @IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN: Driver does not report accurate A-MPDU
 *	length in tx status information
 *
 * @IEEE80211_HW_SUPPORTS_MULTI_BSSID: Hardware supports multi BSSID
 *
 * @IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID: Hardware supports multi BSSID
 *	only for HE APs. Applies if @IEEE80211_HW_SUPPORTS_MULTI_BSSID is set.
 *
 * @IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT: The card and driver only aggregate
 *	MPDUs with the same keyid, allowing mac80211 to keep Tx
 *	A-MPDU sessions active while rekeying with Extended Key ID.
 *
 * @IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD: Hardware supports tx encapsulation
 *	offload
 *
 * @IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD: Hardware supports rx decapsulation
 *	offload
 *
 * @IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP: Hardware supports concurrent rx
 *	decapsulation offload and passing raw 802.11 frames for monitor iface.
 *	If this is supported, the driver must pass both 802.3 frames for real
 *	usage and 802.11 frames with %RX_FLAG_ONLY_MONITOR set for monitor to
 *	the stack.
 *
 * @IEEE80211_HW_DETECTS_COLOR_COLLISION: HW/driver has support for BSS color
 *	collision detection and doesn't need it in software.
 *
 * @IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX: Hardware/driver handles transmitting
 *	multicast frames on all links, mac80211 should not do that.
 *
 * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT
 *	and connecting with a lower bandwidth instead
 * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in
 *	EHT in 5 GHz and connecting with a lower bandwidth instead
 *
 * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so
 *	no need to stop queues. This really should be set by a driver that
 *	implements MLO, so operation can continue on other links when one
 *	link is switching.
 *
 * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
 */
enum ieee80211_hw_flags {
	IEEE80211_HW_HAS_RATE_CONTROL,
	IEEE80211_HW_RX_INCLUDES_FCS,
	IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING,
	IEEE80211_HW_SIGNAL_UNSPEC,
	IEEE80211_HW_SIGNAL_DBM,
	IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC,
	IEEE80211_HW_SPECTRUM_MGMT,
	IEEE80211_HW_AMPDU_AGGREGATION,
	IEEE80211_HW_SUPPORTS_PS,
	IEEE80211_HW_PS_NULLFUNC_STACK,
	IEEE80211_HW_SUPPORTS_DYNAMIC_PS,
	IEEE80211_HW_MFP_CAPABLE,
	IEEE80211_HW_WANT_MONITOR_VIF,
	IEEE80211_HW_NO_VIRTUAL_MONITOR,
	IEEE80211_HW_NO_AUTO_VIF,
	IEEE80211_HW_SW_CRYPTO_CONTROL,
	IEEE80211_HW_SUPPORT_FAST_XMIT,
	IEEE80211_HW_REPORTS_TX_ACK_STATUS,
	IEEE80211_HW_CONNECTION_MONITOR,
	IEEE80211_HW_QUEUE_CONTROL,
	IEEE80211_HW_SUPPORTS_PER_STA_GTK,
	IEEE80211_HW_AP_LINK_PS,
	IEEE80211_HW_TX_AMPDU_SETUP_IN_HW,
	IEEE80211_HW_SUPPORTS_RC_TABLE,
	IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF,
	IEEE80211_HW_TIMING_BEACON_ONLY,
	IEEE80211_HW_SUPPORTS_HT_CCK_RATES,
	IEEE80211_HW_CHANCTX_STA_CSA,
	IEEE80211_HW_SUPPORTS_CLONED_SKBS,
	IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS,
	IEEE80211_HW_TDLS_WIDER_BW,
	IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU,
	IEEE80211_HW_BEACON_TX_STATUS,
	IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR,
	IEEE80211_HW_SUPPORTS_REORDERING_BUFFER,
	IEEE80211_HW_USES_RSS,
	IEEE80211_HW_TX_AMSDU,
	IEEE80211_HW_TX_FRAG_LIST,
	IEEE80211_HW_REPORTS_LOW_ACK,
	IEEE80211_HW_SUPPORTS_TX_FRAG,
	IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA,
	IEEE80211_HW_DOESNT_SUPPORT_QOS_NDP,
	IEEE80211_HW_BUFF_MMPDU_TXQ,
	IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW,
	IEEE80211_HW_STA_MMPDU_TXQ,
	IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN,
	IEEE80211_HW_SUPPORTS_MULTI_BSSID,
	IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID,
	IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT,
	IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD,
	IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD,
	IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP,
	IEEE80211_HW_DETECTS_COLOR_COLLISION,
	IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX,
	IEEE80211_HW_DISALLOW_PUNCTURING,
	IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ,
	IEEE80211_HW_HANDLES_QUIET_CSA,

	/* keep last, obviously */
	NUM_IEEE80211_HW_FLAGS
};

/**
 * struct ieee80211_hw - hardware information and state
 *
 * This structure contains the configuration and hardware
 * information for an 802.11 PHY.
 *
 * @wiphy: This points to the &struct wiphy allocated for this
 *	802.11 PHY. You must fill in the @perm_addr and @dev
 *	members of this structure using SET_IEEE80211_DEV()
 *	and SET_IEEE80211_PERM_ADDR(). Additionally, all supported
 *	bands (with channels, bitrates) are registered here.
 *
 * @conf: &struct ieee80211_conf, device configuration, don't use.
 *
 * @priv: pointer to private area that was allocated for driver use
 *	along with this structure.
 *
 * @flags: hardware flags, see &enum ieee80211_hw_flags.
 *
 * @extra_tx_headroom: headroom to reserve in each transmit skb
 *	for use by the driver (e.g. for transmit headers.)
 *
 * @extra_beacon_tailroom: tailroom to reserve in each beacon tx skb.
 *	Can be used by drivers to add extra IEs.
 *
 * @max_signal: Maximum value for signal (rssi) in RX information, used
 *	only when @IEEE80211_HW_SIGNAL_UNSPEC is set
 *
 * @max_listen_interval: max listen interval in units of beacon interval
 *	that HW supports
 *
 * @queues: number of available hardware transmit queues for
 *	data packets. WMM/QoS requires at least four, these
 *	queues need to have configurable access parameters.
 *
 * @rate_control_algorithm: rate control algorithm for this hardware.
 *	If unset (NULL), the default algorithm will be used. Must be
 *	set before calling ieee80211_register_hw().
 *
 * @vif_data_size: size (in bytes) of the drv_priv data area
 *	within &struct ieee80211_vif.
 * @sta_data_size: size (in bytes) of the drv_priv data area
 *	within &struct ieee80211_sta.
 * @chanctx_data_size: size (in bytes) of the drv_priv data area
 *	within &struct ieee80211_chanctx_conf.
 * @txq_data_size: size (in bytes) of the drv_priv data area
 *	within &struct ieee80211_txq.
 *
 * @max_rates: maximum number of alternate rate retry stages the hw
 *	can handle.
 * @max_report_rates: maximum number of alternate rate retry stages
 *	the hw can report back.
 * @max_rate_tries: maximum number of tries for each stage
 *
 * @max_rx_aggregation_subframes: maximum buffer size (number of
 *	sub-frames) to be used for A-MPDU block ack receiver
 *	aggregation.
 *	This is only relevant if the device has restrictions on the
 *	number of subframes, if it relies on mac80211 to do reordering
 *	it shouldn't be set.
 *
 * @max_tx_aggregation_subframes: maximum number of subframes in an
 *	aggregate an HT/HE device will transmit. In HT AddBA we'll
 *	advertise a constant value of 64 as some older APs crash if
 *	the window size is smaller (an example is LinkSys WRT120N
 *	with FW v1.0.07 build 002 Jun 18 2012).
 *	For AddBA to HE capable peers this value will be used.
 *
 * @max_tx_fragments: maximum number of tx buffers per (A)-MSDU, sum
 *	of 1 + skb_shinfo(skb)->nr_frags for each skb in the frag_list.
 *
 * @offchannel_tx_hw_queue: HW queue ID to use for offchannel TX
 *	(if %IEEE80211_HW_QUEUE_CONTROL is set)
 *
 * @radiotap_mcs_details: lists which MCS information the HW can
 *	report; by default it is set to _MCS, _GI and _BW but doesn't
 *	include _FMT. Use %IEEE80211_RADIOTAP_MCS_HAVE_\* values, only
 *	adding _BW is supported today.
 *
 * @radiotap_vht_details: lists which VHT MCS information the HW reports,
 *	the default is _GI | _BANDWIDTH.
 *	Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values.
 *
 * @radiotap_timestamp: Information for the radiotap timestamp field; if the
 *	@units_pos member is set to a non-negative value then the timestamp
 *	field will be added and populated from the &struct ieee80211_rx_status
 *	device_timestamp.
 * @radiotap_timestamp.units_pos: Must be set to a combination of an
 *	IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and an
 *	IEEE80211_RADIOTAP_TIMESTAMP_SPOS_* value.
 * @radiotap_timestamp.accuracy: If non-negative, fills the accuracy in the
 *	radiotap field and the accuracy known flag will be set.
 *
 * @netdev_features: netdev features to be set in each netdev created
 *	from this HW.
 *	Note that not all features are usable with mac80211; other features
 *	will be rejected during HW registration.
 *
 * @uapsd_queues: This bitmap is included in (re)association frame to indicate
 *	for each access category if it is uAPSD trigger-enabled and delivery-
 *	enabled. Use IEEE80211_WMM_IE_STA_QOSINFO_AC_* to set this bitmap.
 *	Each bit corresponds to a different AC. Value '1' in specific bit means
 *	that corresponding AC is both trigger- and delivery-enabled. '0' means
 *	neither enabled.
 *
 * @uapsd_max_sp_len: maximum number of total buffered frames the WMM AP may
 *	deliver to a WMM STA during any Service Period triggered by the WMM STA.
 *	Use IEEE80211_WMM_IE_STA_QOSINFO_SP_* for correct values.
 *
 * @max_nan_de_entries: maximum number of NAN DE functions supported by the
 *	device.
 *
 * @tx_sk_pacing_shift: Pacing shift to set on TCP sockets when frames from
 *	them are encountered. The default should typically not be changed,
 *	unless the driver has good reasons for needing more buffers.
 *
 * @weight_multiplier: Driver specific airtime weight multiplier used while
 *	refilling deficit of each TXQ.
 *
 * @max_mtu: the maximum MTU that can be set.
 *
 * @tx_power_levels: a list of power levels supported by the wifi hardware.
 *	The power levels can be specified either as integers or fractions.
 *	The power level at idx 0 shall be the maximum positive power level.
 *
 * @max_txpwr_levels_idx: the maximum valid idx of the 'tx_power_levels' list.
 */
struct ieee80211_hw {
	struct ieee80211_conf conf;
	struct wiphy *wiphy;
	const char *rate_control_algorithm;
	void *priv;
	unsigned long flags[BITS_TO_LONGS(NUM_IEEE80211_HW_FLAGS)];
	unsigned int extra_tx_headroom;
	unsigned int extra_beacon_tailroom;
	int vif_data_size;
	int sta_data_size;
	int chanctx_data_size;
	int txq_data_size;
	u16 queues;
	u16 max_listen_interval;
	s8 max_signal;
	u8 max_rates;
	u8 max_report_rates;
	u8 max_rate_tries;
	u16 max_rx_aggregation_subframes;
	u16 max_tx_aggregation_subframes;
	u8 max_tx_fragments;
	u8 offchannel_tx_hw_queue;
	u8 radiotap_mcs_details;
	u16 radiotap_vht_details;
	struct {
		int units_pos;
		s16 accuracy;
	} radiotap_timestamp;
	netdev_features_t netdev_features;
	u8 uapsd_queues;
	u8 uapsd_max_sp_len;
	u8 max_nan_de_entries;
	u8 tx_sk_pacing_shift;
	u8 weight_multiplier;
	u32 max_mtu;
	const s8 *tx_power_levels;
	u8 max_txpwr_levels_idx;
};

static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw,
				       enum ieee80211_hw_flags flg)
{
	return test_bit(flg, hw->flags);
}
#define ieee80211_hw_check(hw, flg)	_ieee80211_hw_check(hw, IEEE80211_HW_##flg)

static inline void _ieee80211_hw_set(struct ieee80211_hw *hw,
				     enum ieee80211_hw_flags flg)
{
	__set_bit(flg, hw->flags);
}
#define ieee80211_hw_set(hw, flg)	_ieee80211_hw_set(hw, IEEE80211_HW_##flg)

/**
 * struct ieee80211_scan_request - hw scan request
 *
 * @ies: pointers to different parts of the IEs (in req.ie)
 * @req: cfg80211 request.
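 *
 * Note that @req must remain the last member, since
 * &struct cfg80211_scan_request ends in a variable-length array (hence the
 * "Keep last" comment in the structure below).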
 */
struct ieee80211_scan_request {
	struct ieee80211_scan_ies ies;

	/* Keep last */
	struct cfg80211_scan_request req;
};

/**
 * struct ieee80211_tdls_ch_sw_params - TDLS channel switch parameters
 *
 * @sta: peer this TDLS channel-switch request/response came from
 * @chandef: channel referenced in a TDLS channel-switch request
 * @action_code: see &enum ieee80211_tdls_actioncode
 * @status: channel-switch response status
 * @timestamp: time at which the frame was received
 * @switch_time: switch-timing parameter received in the frame
 * @switch_timeout: switch-timing parameter received in the frame
 * @tmpl_skb: TDLS switch-channel response template
 * @ch_sw_tm_ie: offset of the channel-switch timing IE inside @tmpl_skb
 */
struct ieee80211_tdls_ch_sw_params {
	struct ieee80211_sta *sta;
	struct cfg80211_chan_def *chandef;
	u8 action_code;
	u32 status;
	u32 timestamp;
	u16 switch_time;
	u16 switch_timeout;
	struct sk_buff *tmpl_skb;
	u32 ch_sw_tm_ie;
};

/**
 * wiphy_to_ieee80211_hw - return a mac80211 driver hw struct from a wiphy
 *
 * @wiphy: the &struct wiphy which we want to query
 *
 * mac80211 drivers can use this to get to their respective
 * &struct ieee80211_hw. Drivers wishing to get to their own private
 * structure can then access it via hw->priv. Note that mac80211 drivers should
 * not use wiphy_priv() to try to get their private driver structure as this
 * is already used internally by mac80211.
 *
 * Return: The mac80211 driver hw struct of @wiphy.
 */
struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy);

/**
 * SET_IEEE80211_DEV - set device for 802.11 hardware
 *
 * @hw: the &struct ieee80211_hw to set the device for
 * @dev: the &struct device of this 802.11 device
 */
static inline void SET_IEEE80211_DEV(struct ieee80211_hw *hw, struct device *dev)
{
	set_wiphy_dev(hw->wiphy, dev);
}

/**
 * SET_IEEE80211_PERM_ADDR - set the permanent MAC address for 802.11 hardware
 *
 * @hw: the &struct ieee80211_hw to set the MAC address for
 * @addr: the address to set
 */
static inline void SET_IEEE80211_PERM_ADDR(struct ieee80211_hw *hw, const u8 *addr)
{
	memcpy(hw->wiphy->perm_addr, addr, ETH_ALEN);
}

static inline struct ieee80211_rate *
ieee80211_get_tx_rate(const struct ieee80211_hw *hw,
		      const struct ieee80211_tx_info *c)
{
	if (WARN_ON_ONCE(c->control.rates[0].idx < 0))
		return NULL;
	return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[0].idx];
}

static inline struct ieee80211_rate *
ieee80211_get_rts_cts_rate(const struct ieee80211_hw *hw,
			   const struct ieee80211_tx_info *c)
{
	if (c->control.rts_cts_rate_idx < 0)
		return NULL;
	return &hw->wiphy->bands[c->band]->bitrates[c->control.rts_cts_rate_idx];
}

static inline struct ieee80211_rate *
ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw,
			     const struct ieee80211_tx_info *c, int idx)
{
	if (c->control.rates[idx + 1].idx < 0)
		return NULL;
	return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[idx + 1].idx];
}

/**
 * ieee80211_free_txskb - free TX skb
 * @hw: the hardware
 * @skb: the skb
 *
 * Free a transmit skb. Use this function when some failure
 * to transmit happened and thus status cannot be reported.
 */
void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);

/**
 * ieee80211_purge_tx_queue - purge TX skb queue
 * @hw: the hardware
 * @skbs: the skbs
 *
 * Free a set of transmit skbs. Use this function when the device is going to
 * stop but some transmit skbs without TX status are still queued.
 * This function does not take the list lock and the caller must hold the
 * relevant locks to use it.
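 *
 * A minimal usage sketch for a driver's teardown path; the private
 * structure, its pending_skbs list and its tx_lock are hypothetical
 * driver state, not part of mac80211:
 *
 *	struct drv_priv *priv = hw->priv;
 *
 *	spin_lock_bh(&priv->tx_lock);
 *	ieee80211_purge_tx_queue(hw, &priv->pending_skbs);
 *	spin_unlock_bh(&priv->tx_lock);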
 */
void ieee80211_purge_tx_queue(struct ieee80211_hw *hw,
			      struct sk_buff_head *skbs);

/**
 * DOC: Hardware crypto acceleration
 *
 * mac80211 is capable of taking advantage of many hardware
 * acceleration designs for encryption and decryption operations.
 *
 * The set_key() callback in the &struct ieee80211_ops for a given
 * device is called to enable hardware acceleration of encryption and
 * decryption. The callback takes a @sta parameter that will be NULL
 * for default keys or keys used for transmission only, or point to
 * the station information for the peer for individual keys.
 * Multiple transmission keys with the same key index may be used when
 * VLANs are configured for an access point.
 *
 * When transmitting, the TX control data will use the @hw_key_idx
 * selected by the driver by modifying the &struct ieee80211_key_conf
 * pointed to by the @key parameter to the set_key() function.
 *
 * The set_key() call for the %SET_KEY command should return 0 if
 * the key is now in use, -%EOPNOTSUPP or -%ENOSPC if it couldn't be
 * added; if you return 0 then hw_key_idx must be assigned to the
 * hardware key index. You are free to use the full u8 range.
 *
 * Note that in the case that the @IEEE80211_HW_SW_CRYPTO_CONTROL flag is
 * set, mac80211 will not automatically fall back to software crypto if
 * enabling hardware crypto failed. The set_key() call may also return the
 * value 1 to permit this specific key/algorithm to be done in software.
 *
 * When the cmd is %DISABLE_KEY then it must succeed.
 *
 * Note that it is permissible to not decrypt a frame even if a key
 * for it has been uploaded to hardware. The stack will not make any
 * decision based on whether a key has been uploaded or not but rather
 * based on the receive flags.
 *
 * The &struct ieee80211_key_conf structure pointed to by the @key
 * parameter is guaranteed to be valid until another call to set_key()
 * removes it, but it can only be used as a cookie to differentiate
 * keys.
 *
 * With TKIP, some hardware needs to be provided with a phase 1 key for RX
 * decryption acceleration (e.g. iwlwifi). Such drivers should provide an
 * update_tkip_key handler.
 * The update_tkip_key() call updates the driver with the new phase 1 key.
 * This happens every time the iv16 wraps around (every 65536 packets). The
 * set_key() call will happen only once for each key (unless the AP did
 * rekeying); it will not include a valid phase 1 key. The valid phase 1 key is
 * provided by update_tkip_key only. The trigger that makes mac80211 call this
 * handler is software decryption with wrap around of iv16.
 *
 * The set_default_unicast_key() call updates the default WEP key index
 * configured to the hardware for WEP encryption type. This is required
 * for devices that support offload of data packets (e.g. ARP responses).
 *
 * Mac80211 drivers should set the %NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 flag
 * when they are able to replace in-use PTK keys according to the following
 * requirements:
 *
 * 1) They do not hand over frames decrypted with the old key to mac80211
 *    once the call to set_key() with command %DISABLE_KEY has been completed,
 * 2) either drop or continue to use the old key for any outgoing frames queued
 *    at the time of the key deletion (including re-transmits),
 * 3) never send out a frame queued prior to the set_key() %SET_KEY command
 *    encrypted with the new key when also needing
 *    @IEEE80211_KEY_FLAG_GENERATE_IV and
 * 4) never send out a frame unencrypted when it should be encrypted.
 *
 * Mac80211 will not queue any new frames for a deleted key to the driver.
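 *
 * To make the rules above concrete, a minimal set_key() implementation
 * might look roughly as follows; the drv_alloc_key_slot() and
 * drv_free_key_slot() helpers, and the CCMP-only hardware, are purely
 * hypothetical:
 *
 *	static int drv_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 *			       struct ieee80211_vif *vif,
 *			       struct ieee80211_sta *sta,
 *			       struct ieee80211_key_conf *key)
 *	{
 *		struct drv_priv *priv = hw->priv;
 *		int slot;
 *
 *		switch (cmd) {
 *		case SET_KEY:
 *			if (key->cipher != WLAN_CIPHER_SUITE_CCMP)
 *				return -EOPNOTSUPP; // fall back to SW crypto
 *			slot = drv_alloc_key_slot(priv, key);
 *			if (slot < 0)
 *				return -ENOSPC;
 *			key->hw_key_idx = slot; // reported in TX control
 *			return 0;
 *		case DISABLE_KEY:
 *			drv_free_key_slot(priv, key->hw_key_idx);
 *			return 0; // DISABLE_KEY must succeed
 *		}
 *		return -EOPNOTSUPP;
 *	}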
 */

/**
 * DOC: Powersave support
 *
 * mac80211 has support for various powersave implementations.
 *
 * First, it can support hardware that handles all powersaving by itself;
 * such hardware should simply set the %IEEE80211_HW_SUPPORTS_PS hardware
 * flag. In that case, it will be told about the desired powersave mode
 * with the %IEEE80211_CONF_PS flag depending on the association status.
 * The hardware must take care of sending nullfunc frames when necessary,
 * i.e. when entering and leaving powersave mode. The hardware is required
 * to look at the AID in beacons and signal to the AP that it woke up when
 * it finds traffic directed to it.
 *
 * %IEEE80211_CONF_PS flag enabled means that the powersave mode defined in
 * IEEE 802.11-2007 section 11.2 is enabled. This is not to be confused
 * with hardware wakeup and sleep states. The driver is responsible for
 * waking up the hardware before issuing commands to the hardware and
 * putting it back to sleep at appropriate times.
 *
 * When PS is enabled, hardware needs to wake up for beacons and receive the
 * buffered multicast/broadcast frames after the beacon. Also it must be
 * possible to send frames and receive the acknowledgment frame.
 *
 * Other hardware designs cannot send nullfunc frames by themselves and also
 * need software support for parsing the TIM bitmap. This is also supported
 * by mac80211 by combining the %IEEE80211_HW_SUPPORTS_PS and
 * %IEEE80211_HW_PS_NULLFUNC_STACK flags. The hardware is of course still
 * required to pass up beacons. The hardware is still required to handle
 * waking up for multicast traffic; if it cannot the driver must handle that
 * as best as it can; mac80211 is too slow to do that.
 *
 * Dynamic powersave is an extension to normal powersave in which the
 * hardware stays awake for a user-specified period of time after sending a
 * frame so that reply frames need not be buffered and therefore delayed to
 * the next wakeup. It's a compromise between getting good enough latency
 * when there's data traffic and still saving significant power in idle
 * periods.
 *
 * Dynamic powersave is simply supported by mac80211 enabling and disabling
 * PS based on traffic. The driver only needs to set the
 * %IEEE80211_HW_SUPPORTS_PS flag and mac80211 will handle everything
 * automatically. Additionally, hardware having support for the dynamic PS
 * feature may set the %IEEE80211_HW_SUPPORTS_DYNAMIC_PS flag to indicate
 * that it can support dynamic PS mode itself. The driver needs to look at
 * the @dynamic_ps_timeout hardware configuration value and use that value
 * whenever %IEEE80211_CONF_PS is set. In this case mac80211 will disable
 * the dynamic PS feature in the stack and will just keep %IEEE80211_CONF_PS
 * enabled whenever the user has enabled powersave.
 *
 * The driver indicates U-APSD client support by setting the
 * %IEEE80211_VIF_SUPPORTS_UAPSD flag. The mode is configured through the
 * uapsd parameter in conf_tx() operation. Hardware needs to send the QoS
 * Nullfunc frames and stay awake until the service period has ended. To
 * utilize U-APSD, dynamic powersave is disabled for the VoIP AC and all
 * frames from that AC are transmitted with powersave enabled.
 *
 * Note: U-APSD client mode is not yet supported with
 * %IEEE80211_HW_PS_NULLFUNC_STACK.
 */

/**
 * DOC: Beacon filter support
 *
 * Some hardware has beacon filter support to reduce host cpu wakeups
 * which will reduce system power consumption. It usually works so that
 * the firmware creates a checksum of the beacon but omits all constantly
 * changing elements (TSF, TIM etc).
 * Whenever the checksum changes the
 * beacon is forwarded to the host, otherwise it will be just dropped. That
 * way the host will only receive beacons where some relevant information
 * (for example ERP protection or WMM settings) has changed.
 *
 * Beacon filter support is advertised with the %IEEE80211_VIF_BEACON_FILTER
 * interface capability. The driver needs to enable beacon filter support
 * whenever power save is enabled, that is, whenever %IEEE80211_CONF_PS is
 * set. When power save is enabled, the stack will not check for beacon loss
 * and the driver needs to notify about loss of beacons with
 * ieee80211_beacon_loss().
 *
 * The time (or number of beacons missed) until the firmware notifies the
 * driver of a beacon loss event (which in turn causes the driver to call
 * ieee80211_beacon_loss()) should be configurable and will be controlled
 * by mac80211 and the roaming algorithm in the future.
 *
 * Since there may be constantly changing information elements that nothing
 * in the software stack cares about, we will, in the future, have mac80211
 * tell the driver which information elements are interesting in the sense
 * that we want to see changes in them. This will include
 *
 * - a list of information element IDs
 * - a list of OUIs for the vendor information element
 *
 * Ideally, the hardware would filter out any beacons without changes in the
 * requested elements, but if it cannot support that it may, at the expense
 * of some efficiency, filter out only a subset. For example, if the device
 * doesn't support checking for OUIs it should pass up all changes in all
 * vendor information elements.
 *
 * Note that change, for the sake of simplification, also includes information
 * elements appearing or disappearing from the beacon.
 *
 * Some hardware supports an "ignore list" instead. Just make sure nothing
 * that was requested is on the ignore list, and include commonly changing
 * information element IDs in the ignore list, for example 11 (BSS load) and
 * the various vendor-assigned IEs with unknown contents (128, 129, 133-136,
 * 149, 150, 155, 156, 173, 176, 178, 179, 219); for forward compatibility
 * it could also include some currently unused IDs.
 *
 * In addition to these capabilities, hardware should support notifying the
 * host of changes in the beacon RSSI. This is relevant to implement roaming
 * when no traffic is flowing (when traffic is flowing we see the RSSI of
 * the received data packets). This can consist of notifying the host when
 * the RSSI changes significantly or when it drops below or rises above
 * configurable thresholds. In the future these thresholds will also be
 * configured by mac80211 (which gets them from userspace) to implement
 * them as the roaming algorithm requires.
 *
 * If the hardware cannot implement this, the driver should ask it to
 * periodically pass beacon frames to the host so that software can do the
 * signal strength threshold checking.
 */

/**
 * DOC: Spatial multiplexing power save
 *
 * SMPS (Spatial multiplexing power save) is a mechanism to conserve
 * power in an 802.11n implementation. For details on the mechanism
 * and rationale, please refer to 802.11 (as amended by 802.11n-2009)
 * "11.2.3 SM power save".
 *
 * The mac80211 implementation is capable of sending action frames
 * to update the AP about the station's SMPS mode, and will instruct
 * the driver to enter the specific mode. It will also announce the
 * requested SMPS mode during the association handshake. Hardware
 * support for this feature is required, and can be indicated by
 * hardware flags.
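 *
 * As a rough illustration of the driver side described below
 * (drv_apply_smps() is a hypothetical helper), the config() callback
 * might do:
 *
 *	if (changed & IEEE80211_CONF_CHANGE_SMPS)
 *		drv_apply_smps(hw->priv, hw->conf.smps_mode);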
 *
 * The default mode will be "automatic", which nl80211/cfg80211
 * defines to be dynamic SMPS in (regular) powersave, and SMPS
 * turned off otherwise.
 *
 * To support this feature, the driver must set the appropriate
 * hardware support flags, and handle the SMPS flag in the config()
 * operation. It will then be instructed through this mechanism to
 * enter the requested SMPS mode while associated to an HT AP.
 */

/**
 * DOC: Frame filtering
 *
 * mac80211 needs to see many management frames for proper
 * operation, and users may want to see many more frames when
 * in monitor mode. However, for best CPU usage and power consumption,
 * having as few frames as possible percolate through the stack is
 * desirable. Hence, the hardware should filter as much as possible.
 *
 * To achieve this, mac80211 uses filter flags (see below) to tell
 * the driver's configure_filter() function which frames should be
 * passed to mac80211 and which should be filtered out.
 *
 * Before configure_filter() is invoked, the prepare_multicast()
 * callback is invoked with the parameters @mc_count and @mc_list
 * for the combined multicast address list of all virtual interfaces.
 * Its use is optional, and it returns a u64 that is passed to
 * configure_filter(). Additionally, configure_filter() has the
 * arguments @changed_flags telling which flags were changed and
 * @total_flags with the new flag states.
 *
 * If your device has no multicast address filters your driver will
 * need to check both the %FIF_ALLMULTI flag and the @mc_count
 * parameter to see whether multicast frames should be accepted
 * or dropped.
 *
 * All unsupported flags in @total_flags must be cleared.
 * Hardware does not support a flag if it is incapable of _passing_
 * the frame to the stack. Otherwise the driver must ignore
 * the flag, but not clear it.
 * You must _only_ clear the flag (announce no support for the
 * flag to mac80211) if you are not able to pass the packet type
 * to the stack (so the hardware always filters it).
 * So for example, you should clear @FIF_CONTROL, if your hardware
 * always filters control frames. If your hardware always passes
 * control frames to the kernel and is incapable of filtering them,
 * you do _not_ clear the @FIF_CONTROL flag.
 * This rule applies to all other FIF flags as well.
 */

/**
 * DOC: AP support for powersaving clients
 *
 * In order to implement AP and P2P GO modes, mac80211 has support for
 * client powersaving, both "legacy" PS (PS-Poll/null data) and uAPSD.
 * There currently is no support for sAPSD.
 *
 * There is one assumption that mac80211 makes, namely that a client
 * will not poll with PS-Poll and trigger with uAPSD at the same time.
 * Both are supported, and both can be used by the same client, but
 * they can't be used concurrently by the same client. This simplifies
 * the driver code.
 *
 * The first thing to keep in mind is that there is a flag for complete
 * driver implementation: %IEEE80211_HW_AP_LINK_PS. If this flag is set,
 * mac80211 expects the driver to handle most of the state machine for
 * powersaving clients and will ignore the PM bit in incoming frames.
 * Drivers then use ieee80211_sta_ps_transition() to inform mac80211 of
 * stations' powersave transitions. In this mode, mac80211 also doesn't
 * handle PS-Poll/uAPSD.
 *
 * In the mode without %IEEE80211_HW_AP_LINK_PS, mac80211 will check the
 * PM bit in incoming frames for client powersave transitions. When a
 * station goes to sleep, we will stop transmitting to it.
 * There is,
 * however, a race condition: a station might go to sleep while there is
 * data buffered on hardware queues. If the device has support for this
 * it will reject frames, and the driver should give the frames back to
 * mac80211 with the %IEEE80211_TX_STAT_TX_FILTERED flag set which will
 * cause mac80211 to retry the frame when the station wakes up. The
 * driver is also notified of powersave transitions by calling its
 * @sta_notify callback.
 *
 * When the station is asleep, it has three choices: it can wake up,
 * it can PS-Poll, or it can possibly start a uAPSD service period.
 * Waking up is implemented by simply transmitting all buffered (and
 * filtered) frames to the station. This is the easiest case. When
 * the station sends a PS-Poll or a uAPSD trigger frame, mac80211
 * will inform the driver of this with the @allow_buffered_frames
 * callback; this callback is optional. mac80211 will then transmit
 * the frames as usual and set the %IEEE80211_TX_CTL_NO_PS_BUFFER
 * on each frame. The last frame in the service period (or the only
 * response to a PS-Poll) also has %IEEE80211_TX_STATUS_EOSP set to
 * indicate that it ends the service period; as this frame must have
 * TX status report it also sets %IEEE80211_TX_CTL_REQ_TX_STATUS.
 * When TX status is reported for this frame, the service period is
 * marked as having ended and a new one can be started by the peer.
 *
 * Additionally, non-bufferable MMPDUs can also be transmitted by
 * mac80211 with %IEEE80211_TX_CTL_NO_PS_BUFFER set in them.
 *
 * Another race condition can happen on some devices like iwlwifi
 * when there are frames queued for the station and it wakes up
 * or polls; the frames that are already queued could end up being
 * transmitted first instead, causing reordering and/or wrong
 * processing of the EOSP. The cause is that allowing frames to be
 * transmitted to a certain station is out-of-band communication to
 * the device. To allow this problem to be solved, the driver can
 * call ieee80211_sta_block_awake() if frames are buffered when it
 * is notified that the station went to sleep. When all these frames
 * have been filtered (see above), it must call the function again
 * to indicate that the station is no longer blocked.
 *
 * If the driver buffers frames for aggregation in any way, it must
 * use the ieee80211_sta_set_buffered() call when it is notified of
 * the station going to sleep to inform mac80211 of any TIDs that have
 * frames buffered. Note that when a station wakes up
 * this information is reset (hence the requirement to call it when
 * informed of the station going to sleep). Then, when a service
 * period starts for any reason, @release_buffered_frames is called
 * with the number of frames to be released and which TIDs they are
 * to come from. In this case, the driver is responsible for setting
 * the EOSP (for uAPSD) and MORE_DATA bits in the released frames.
 * To help, the @more_data parameter is passed to tell the driver if
 * there is more data on other TIDs -- the TIDs to release frames
 * from are ignored since mac80211 doesn't know how many frames the
 * buffers for those TIDs contain.
 *
 * If the driver also implements GO mode, where absence periods may
 * shorten service periods (or abort PS-Poll responses), it must
 * filter those response frames except in the case of frames that
 * are buffered in the driver -- those must remain buffered to avoid
 * reordering.
 * Because it is possible that no frames are released
 * in this case, the driver must call ieee80211_sta_eosp()
 * to indicate to mac80211 that the service period ended anyway.
 *
 * Finally, if frames from multiple TIDs are released from mac80211
 * but the driver might reorder them, it must clear & set the flags
 * appropriately (only the last frame may have %IEEE80211_TX_STATUS_EOSP)
 * and also take care of the EOSP and MORE_DATA bits in the frame.
 * The driver may also use ieee80211_sta_eosp() in this case.
 *
 * Note that if the driver ever buffers frames other than QoS-data
 * frames, it must take care to never send a non-QoS-data frame as
 * the last frame in a service period, adding a QoS-nulldata frame
 * after a non-QoS-data frame if needed.
 */

/**
 * DOC: HW queue control
 *
 * Before HW queue control was introduced, mac80211 only had a single static
 * assignment of per-interface AC software queues to hardware queues. This
 * was problematic for a few reasons:
 * 1) off-channel transmissions might get stuck behind other frames
 * 2) multiple virtual interfaces couldn't be handled correctly
 * 3) after-DTIM frames could get stuck behind other frames
 *
 * To solve this, hardware typically uses multiple different queues for all
 * the different usages, and this needs to be propagated into mac80211 so it
 * won't have the same problem with the software queues.
 *
 * Therefore, mac80211 now offers the %IEEE80211_HW_QUEUE_CONTROL capability
 * flag that tells it that the driver implements its own queue control. To do
 * so, the driver will set up the various queues in each &struct ieee80211_vif
 * and the offchannel queue in &struct ieee80211_hw. In response, mac80211 will
 * use those queue IDs in the hw_queue field of &struct ieee80211_tx_info and
 * if necessary will queue the frame on the right software queue that mirrors
 * the hardware queue.
 * Additionally, the driver has to then use these HW queue IDs for the queue
 * management functions (ieee80211_stop_queue() et al.)
 *
 * The driver is free to set up the queue mappings as needed; multiple virtual
 * interfaces may map to the same hardware queues if needed. The setup has to
 * happen during add_interface or change_interface callbacks. For example, a
 * driver supporting station+station and station+AP modes might decide to have
 * 10 hardware queues to handle different scenarios:
 *
 *	4 AC HW queues for 1st vif: 0, 1, 2, 3
 *	4 AC HW queues for 2nd vif: 4, 5, 6, 7
 *	after-DTIM queue for AP:    8
 *	off-channel queue:          9
 *
 * It would then set up the hardware like this:
 *	hw.offchannel_tx_hw_queue = 9
 *
 * and the first virtual interface that is added as follows:
 *	vif.hw_queue[IEEE80211_AC_VO] = 0
 *	vif.hw_queue[IEEE80211_AC_VI] = 1
 *	vif.hw_queue[IEEE80211_AC_BE] = 2
 *	vif.hw_queue[IEEE80211_AC_BK] = 3
 *	vif.cab_queue = 8 // if AP mode, otherwise %IEEE80211_INVAL_HW_QUEUE
 * and the second virtual interface with 4-7.
 *
 * If queue 6 gets full, for example, mac80211 would only stop the second
 * virtual interface's BE queue since virtual interface queues are per AC.
 *
 * Note that the vif.cab_queue value should be set to %IEEE80211_INVAL_HW_QUEUE
 * whenever the queue is not used (i.e. the interface is not in AP mode) if the
 * queue could potentially be shared since mac80211 will look at cab_queue when
 * a queue is stopped/woken even if the interface is not in AP mode.
 */

/**
 * enum ieee80211_filter_flags - hardware filter flags
 *
 * These flags determine what the filter in hardware should be
 * programmed to let through and what should not be passed to the
 * stack.
 * It is always safe to pass more frames than requested,
 * but this has negative impact on power consumption.
 *
 * @FIF_ALLMULTI: pass all multicast frames, this is used if requested
 *	by the user or if the hardware is not capable of filtering by
 *	multicast address.
 *
 * @FIF_FCSFAIL: pass frames with failed FCS (but you need to set the
 *	%RX_FLAG_FAILED_FCS_CRC for them)
 *
 * @FIF_PLCPFAIL: pass frames with failed PLCP CRC (but you need to set
 *	the %RX_FLAG_FAILED_PLCP_CRC for them)
 *
 * @FIF_BCN_PRBRESP_PROMISC: This flag is set during scanning to indicate
 *	to the hardware that it should not filter beacons or probe responses
 *	by BSSID. Filtering them can greatly reduce the amount of processing
 *	mac80211 needs to do and the amount of CPU wakeups, so you should
 *	honour this flag if possible.
 *
 * @FIF_CONTROL: pass control frames (except for PS Poll) addressed to this
 *	station
 *
 * @FIF_OTHER_BSS: pass frames destined to other BSSes
 *
 * @FIF_PSPOLL: pass PS Poll frames
 *
 * @FIF_PROBE_REQ: pass probe request frames
 *
 * @FIF_MCAST_ACTION: pass multicast Action frames
 */
enum ieee80211_filter_flags {
	FIF_ALLMULTI		= 1<<1,
	FIF_FCSFAIL		= 1<<2,
	FIF_PLCPFAIL		= 1<<3,
	FIF_BCN_PRBRESP_PROMISC	= 1<<4,
	FIF_CONTROL		= 1<<5,
	FIF_OTHER_BSS		= 1<<6,
	FIF_PSPOLL		= 1<<7,
	FIF_PROBE_REQ		= 1<<8,
	FIF_MCAST_ACTION	= 1<<9,
};

/**
 * enum ieee80211_ampdu_mlme_action - A-MPDU actions
 *
 * These flags are used with the ampdu_action() callback in
 * &struct ieee80211_ops to indicate which action is needed.
 *
 * Note that drivers MUST be able to deal with a TX aggregation
 * session being stopped even before they OK'ed starting it by
 * calling ieee80211_start_tx_ba_cb_irqsafe, because the peer
 * might receive the addBA frame and send a delBA right away!
 *
 * @IEEE80211_AMPDU_RX_START: start RX aggregation
 * @IEEE80211_AMPDU_RX_STOP: stop RX aggregation
 * @IEEE80211_AMPDU_TX_START: start TX aggregation, the driver must either
 *	call ieee80211_start_tx_ba_cb_irqsafe() or return the special status
 *	%IEEE80211_AMPDU_TX_START_DELAY_ADDBA to delay the addBA response
 *	until ieee80211_start_tx_ba_cb_irqsafe() has been called, or just
 *	return the special status %IEEE80211_AMPDU_TX_START_IMMEDIATE.
 * @IEEE80211_AMPDU_TX_OPERATIONAL: TX aggregation has become operational
 * @IEEE80211_AMPDU_TX_STOP_CONT: stop TX aggregation but continue transmitting
 *	queued packets, now unaggregated. After all packets are transmitted the
 *	driver has to call ieee80211_stop_tx_ba_cb_irqsafe().
 * @IEEE80211_AMPDU_TX_STOP_FLUSH: stop TX aggregation and flush all packets,
 *	called when the station is removed. There's no need or reason to call
 *	ieee80211_stop_tx_ba_cb_irqsafe() in this case as mac80211 assumes the
 *	session is gone and removes the station.
 * @IEEE80211_AMPDU_TX_STOP_FLUSH_CONT: called when TX aggregation is stopped
 *	but the driver hasn't called ieee80211_stop_tx_ba_cb_irqsafe() yet and
 *	now the connection is dropped and the station will be removed. Drivers
 *	should clean up and drop remaining packets when this is called.
 */
enum ieee80211_ampdu_mlme_action {
	IEEE80211_AMPDU_RX_START,
	IEEE80211_AMPDU_RX_STOP,
	IEEE80211_AMPDU_TX_START,
	IEEE80211_AMPDU_TX_STOP_CONT,
	IEEE80211_AMPDU_TX_STOP_FLUSH,
	IEEE80211_AMPDU_TX_STOP_FLUSH_CONT,
	IEEE80211_AMPDU_TX_OPERATIONAL,
};

#define IEEE80211_AMPDU_TX_START_IMMEDIATE 1
#define IEEE80211_AMPDU_TX_START_DELAY_ADDBA 2

/**
 * struct ieee80211_ampdu_params - AMPDU action parameters
 *
 * @action: the ampdu action, value from %ieee80211_ampdu_mlme_action.
* @sta: peer of this AMPDU session * @tid: tid of the BA session * @ssn: start sequence number of the session. TX/RX_STOP can pass 0. When * action is set to %IEEE80211_AMPDU_RX_START the driver passes back the * actual ssn value used to start the session and writes the value here. * @buf_size: reorder buffer size (number of subframes). Valid only when the * action is set to %IEEE80211_AMPDU_RX_START or * %IEEE80211_AMPDU_TX_OPERATIONAL * @amsdu: indicates the peer's ability to receive A-MSDU within A-MPDU. * valid when the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL * @timeout: BA session timeout. Valid only when the action is set to * %IEEE80211_AMPDU_RX_START */ struct ieee80211_ampdu_params { enum ieee80211_ampdu_mlme_action action; struct ieee80211_sta *sta; u16 tid; u16 ssn; u16 buf_size; bool amsdu; u16 timeout; }; /** * enum ieee80211_frame_release_type - frame release reason * @IEEE80211_FRAME_RELEASE_PSPOLL: frame released for PS-Poll * @IEEE80211_FRAME_RELEASE_UAPSD: frame(s) released due to * frame received on trigger-enabled AC */ enum ieee80211_frame_release_type { IEEE80211_FRAME_RELEASE_PSPOLL, IEEE80211_FRAME_RELEASE_UAPSD, }; /** * enum ieee80211_rate_control_changed - flags to indicate what changed * * @IEEE80211_RC_BW_CHANGED: The bandwidth that can be used to transmit * to this station changed. The actual bandwidth is in the station * information -- for HT20/40 the IEEE80211_HT_CAP_SUP_WIDTH_20_40 * flag changes, for HT and VHT the bandwidth field changes. * @IEEE80211_RC_SMPS_CHANGED: The SMPS state of the station changed. * @IEEE80211_RC_SUPP_RATES_CHANGED: The supported rate set of this peer * changed (in IBSS mode) due to discovering more information about * the peer. * @IEEE80211_RC_NSS_CHANGED: N_SS (number of spatial streams) was changed * by the peer */ enum ieee80211_rate_control_changed { IEEE80211_RC_BW_CHANGED = BIT(0), IEEE80211_RC_SMPS_CHANGED = BIT(1), IEEE80211_RC_SUPP_RATES_CHANGED = BIT(2), IEEE80211_RC_NSS_CHANGED = BIT(3), }; /** * enum ieee80211_roc_type - remain on channel type * * With the support for multi channel contexts and multi channel operations, * remain on channel operations might be limited/deferred/aborted by other * flows/operations which have higher priority (and vice versa). * Specifying the ROC type can be used by devices to prioritize the ROC * operations compared to other operations/flows. * * @IEEE80211_ROC_TYPE_NORMAL: There are no special requirements for this ROC. * @IEEE80211_ROC_TYPE_MGMT_TX: The remain on channel request is required * for sending management frames offchannel. */ enum ieee80211_roc_type { IEEE80211_ROC_TYPE_NORMAL = 0, IEEE80211_ROC_TYPE_MGMT_TX, }; /** * enum ieee80211_reconfig_type - reconfig type * * This enum is used by the reconfig_complete() callback to indicate what * reconfiguration type was completed. * * @IEEE80211_RECONFIG_TYPE_RESTART: hw restart type * (also due to resume() callback returning 1) * @IEEE80211_RECONFIG_TYPE_SUSPEND: suspend type (regardless * of wowlan configuration) */ enum ieee80211_reconfig_type { IEEE80211_RECONFIG_TYPE_RESTART, IEEE80211_RECONFIG_TYPE_SUSPEND, }; /** * struct ieee80211_prep_tx_info - prepare TX information * @duration: if non-zero, hint about the required duration, * only used with the mgd_prepare_tx() method. * @subtype: frame subtype (auth, (re)assoc, deauth, disassoc) * @success: whether the frame exchange was successful, only * used with the mgd_complete_tx() method, and then only * valid for auth and (re)assoc. 
* @was_assoc: set if this call is due to deauth/disassoc * while just having been associated * @link_id: the link id on which the frame will be TX'ed. * Only used with the mgd_prepare_tx() method. */ struct ieee80211_prep_tx_info { u16 duration; u16 subtype; u8 success:1, was_assoc:1; int link_id; }; /** * struct ieee80211_ops - callbacks from mac80211 to the driver * * This structure contains various callbacks that the driver may * handle or, in some cases, must handle, for example to configure * the hardware to a new channel or to transmit a frame. * * @tx: Handler that 802.11 module calls for each transmitted frame. * skb contains the buffer starting from the IEEE 802.11 header. * The low-level driver should send the frame out based on * configuration in the TX control data. This handler should, * preferably, never fail and stop queues appropriately. * Must be atomic. * * @start: Called before the first netdevice attached to the hardware * is enabled. This should turn on the hardware and must turn on * frame reception (for possibly enabled monitor interfaces.) * Returns zero on success, or a negative error code which may be * seen in userspace. * When the device is started it should not have a MAC address * to avoid acknowledging frames before a non-monitor device * is added. * Must be implemented and can sleep. * * @stop: Called after the last netdevice attached to the hardware * is disabled. This should turn off the hardware (at least * it must turn off frame reception.) * May be called right after add_interface if that rejects * an interface. If you added any work onto the mac80211 workqueue, * you should make sure to cancel it in this callback. * Must be implemented and can sleep. * * @suspend: Suspend the device; mac80211 itself will quiesce before and * stop transmitting and doing any other configuration, and then * ask the device to suspend. This is only invoked when WoWLAN is * configured, otherwise the device is deconfigured completely and * reconfigured at resume time. * The driver may also impose special conditions under which it * wants to use the "normal" suspend (deconfigure), say if it only * supports WoWLAN when the device is associated. In this case, it * must return 1 from this function. * * @resume: If WoWLAN was configured, this indicates that mac80211 is * now resuming its operation, after this the device must be fully * functional again. If this returns an error, the only way out is * to also unregister the device. If it returns 1, then mac80211 * will also go through the regular complete restart on resume. * * @set_wakeup: Enable or disable wakeup when WoWLAN configuration is * modified. The reason is that device_set_wakeup_enable() is * supposed to be called when the configuration changes, not only * in suspend(). * * @add_interface: Called when a netdevice attached to the hardware is * enabled. Because it is not called for monitor mode devices, @start * and @stop must be implemented. * The driver should perform any initialization it needs before * the device can be enabled. The initial configuration for the * interface is given in the conf parameter. * The callback may refuse to add an interface by returning a * negative error code (which will be seen in userspace.) * Must be implemented and can sleep. * * @change_interface: Called when a netdevice changes type. This callback * is optional, but only if it is supported can interface types be * switched while the interface is UP. The callback may sleep.
* Note that while an interface is being switched, it will not be * found by the interface iteration callbacks. * * @remove_interface: Notifies a driver that an interface is going down. * The @stop callback is called after this if it is the last interface * and no monitor interfaces are present. * When all interfaces are removed, the MAC address in the hardware * must be cleared so the device no longer acknowledges packets, * the mac_addr member of the conf structure is, however, set to the * MAC address of the device going away. * Hence, this callback must be implemented. It can sleep. * * @config: Handler for configuration requests. IEEE 802.11 code calls this * function to change hardware configuration, e.g., channel. * This function should never fail but returns a negative error code * if it does. The callback can sleep. * * @bss_info_changed: Handler for configuration requests related to BSS * parameters that may vary during BSS's lifespan, and may affect low * level driver (e.g. assoc/disassoc status, erp parameters). * This function should not be used if no BSS has been set, unless * for association indication. The @changed parameter indicates which * of the bss parameters has changed when a call is made. The callback * can sleep. * Note: this callback is called if @vif_cfg_changed or @link_info_changed * are not implemented. * * @vif_cfg_changed: Handler for configuration requests related to interface * (MLD) parameters from &struct ieee80211_vif_cfg that vary during the * lifetime of the interface (e.g. assoc status, IP addresses, etc.) * The @changed parameter indicates which value changed. * The callback can sleep. * * @link_info_changed: Handler for configuration requests related to link * parameters from &struct ieee80211_bss_conf that are related to an * individual link. e.g. legacy/HT/VHT/... rate information. * The @changed parameter indicates which value changed, and the @link_id * parameter indicates the link ID. Note that the @link_id will be 0 for * non-MLO connections. * The callback can sleep. * * @prepare_multicast: Prepare for multicast filter configuration. * This callback is optional, and its return value is passed * to configure_filter(). This callback must be atomic. * * @configure_filter: Configure the device's RX filter. * See the section "Frame filtering" for more information. * This callback must be implemented and can sleep. * * @config_iface_filter: Configure the interface's RX filter. * This callback is optional and is used to configure which frames * should be passed to mac80211. The filter_flags is the combination * of FIF_* flags. The changed_flags is a bit mask that indicates * which flags are changed. * This callback can sleep. * * @set_tim: Set TIM bit. mac80211 calls this function when a TIM bit * must be set or cleared for a given STA. Must be atomic. * * @set_key: See the section "Hardware crypto acceleration" * This callback is only called between add_interface and * remove_interface calls, i.e. while the given virtual interface * is enabled. * Returns a negative error code if the key can't be added. * The callback can sleep. * * @update_tkip_key: See the section "Hardware crypto acceleration" * This callback will be called in the context of Rx. Called for drivers * which set IEEE80211_KEY_FLAG_TKIP_REQ_RX_P1_KEY. * The callback must be atomic. 
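*
* As an illustration of the @set_key callback above, a minimal sketch
* (hypothetical driver; the hypothetical_hw_*() helpers are assumptions,
* not mac80211 API). Returning -EOPNOTSUPP makes mac80211 fall back to
* software crypto for ciphers the hardware cannot accelerate::
*
*	static int drv_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
*			       struct ieee80211_vif *vif,
*			       struct ieee80211_sta *sta,
*			       struct ieee80211_key_conf *key)
*	{
*		switch (cmd) {
*		case SET_KEY:
*			if (key->cipher != WLAN_CIPHER_SUITE_CCMP)
*				return -EOPNOTSUPP;
*			key->hw_key_idx = hypothetical_hw_add_key(hw, key);
*			return 0;
*		case DISABLE_KEY:
*			hypothetical_hw_del_key(hw, key->hw_key_idx);
*			return 0;
*		}
*		return -EINVAL;
*	}
*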
* * @set_rekey_data: If the device supports GTK rekeying, for example while the * host is suspended, it can assign this callback to retrieve the data * necessary to do GTK rekeying; this is the KEK, KCK and replay counter. * After rekeying was done it should (for example during resume) notify * userspace of the new replay counter using ieee80211_gtk_rekey_notify(). * * @set_default_unicast_key: Set the default (unicast) key index, useful for * WEP when the device sends data packets autonomously, e.g. for ARP * offloading. The index can be 0-3, or -1 for unsetting it. * * @hw_scan: Ask the hardware to service the scan request, no need to start * the scan state machine in the stack. The scan must honour the channel * configuration done by the regulatory agent in the wiphy's * registered bands. The hardware (or the driver) needs to make sure * that power save is disabled. * The @req ie/ie_len members are rewritten by mac80211 to contain the * entire IEs after the SSID, so that drivers need not look at these * at all but just send them after the SSID -- mac80211 includes the * (extended) supported rates and HT information (where applicable). * When the scan finishes, ieee80211_scan_completed() must be called; * note that it also must be called when the scan cannot finish due to * any error unless this callback returned a negative error code. * This callback is also allowed to return the special return value 1; * this indicates that hardware scan isn't desirable right now and a * software scan should be done instead. A driver wishing to use this * capability must ensure its (hardware) scan capabilities aren't * advertised as more capable than mac80211's software scan is. * The callback can sleep. * * @cancel_hw_scan: Ask the low-level driver to cancel the active hw scan. * The driver should ask the hardware to cancel the scan (if possible), * but the scan will be completed only after the driver calls * ieee80211_scan_completed(). * This callback is needed for wowlan, to prevent enqueueing a new * scan_work after the low-level driver was already suspended. * The callback can sleep. * * @sched_scan_start: Ask the hardware to start scanning repeatedly at * specific intervals. The driver must call the * ieee80211_sched_scan_results() function whenever it finds results. * This process will continue until sched_scan_stop is called. * * @sched_scan_stop: Tell the hardware to stop an ongoing scheduled scan. * In this case, ieee80211_sched_scan_stopped() must not be called. * * @sw_scan_start: Notifier function that is called just before a software scan * is started. Can be NULL, if the driver doesn't need this notification. * The mac_addr parameter allows supporting NL80211_SCAN_FLAG_RANDOM_ADDR, * the driver may set the NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR flag if it * can use this parameter. The callback can sleep. * * @sw_scan_complete: Notifier function that is called just after a * software scan finished. Can be NULL, if the driver doesn't need * this notification. * The callback can sleep. * * @get_stats: Return low-level statistics. * Returns zero if statistics are available. * The callback can sleep. * * @get_key_seq: If your device implements encryption in hardware and does * IV/PN assignment then this callback should be provided to read the * IV/PN for the given key from hardware. * The callback must be atomic. * * @set_frag_threshold: Configuration of fragmentation threshold. Assign this * if the device does fragmentation by itself.
Note that to prevent the * stack from doing fragmentation IEEE80211_HW_SUPPORTS_TX_FRAG * should be set as well. * The callback can sleep. * * @set_rts_threshold: Configuration of RTS threshold (if device needs it) * The callback can sleep. * * @sta_add: Notifies low level driver about addition of an associated station, * AP, IBSS/WDS/mesh peer etc. This callback can sleep. * * @sta_remove: Notifies low level driver about removal of an associated * station, AP, IBSS/WDS/mesh peer etc. Note that after the callback * returns it isn't safe to use the pointer, not even RCU protected; * no RCU grace period is guaranteed between returning here and freeing * the station. See @sta_pre_rcu_remove if needed. * This callback can sleep. * * @vif_add_debugfs: Drivers can use this callback to add a debugfs vif * directory with its files. This callback should be within a * CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep. * * @link_add_debugfs: Drivers can use this callback to add debugfs files * when a link is added to a mac80211 vif. This callback should be within * a CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep. * For non-MLO the callback will be called once for the default bss_conf * with the vif's directory rather than a separate subdirectory. * * @sta_add_debugfs: Drivers can use this callback to add debugfs files * when a station is added to mac80211's station list. This callback * should be within a CONFIG_MAC80211_DEBUGFS conditional. This * callback can sleep. * * @link_sta_add_debugfs: Drivers can use this callback to add debugfs files * when a link is added to a mac80211 station. This callback * should be within a CONFIG_MAC80211_DEBUGFS conditional. This * callback can sleep. * For non-MLO the callback will be called once for the deflink with the * station's directory rather than a separate subdirectory. * * @sta_notify: Notifies low level driver about power state transition of an * associated station, AP, IBSS/WDS/mesh peer etc. For a VIF operating * in AP mode, this callback will not be called when the flag * %IEEE80211_HW_AP_LINK_PS is set. Must be atomic. * * @sta_set_txpwr: Configure the station tx power. This callback sets the tx * power for the station. * This callback can sleep. * * @sta_state: Notifies low level driver about state transition of a * station (which can be the AP, a client, IBSS/WDS/mesh peer etc.) * This callback is mutually exclusive with @sta_add/@sta_remove. * It must not fail for down transitions but may fail for transitions * up the list of states. Also note that after the callback returns it * isn't safe to use the pointer, not even RCU protected - no RCU grace * period is guaranteed between returning here and freeing the station. * See @sta_pre_rcu_remove if needed. * The callback can sleep. * * @sta_pre_rcu_remove: Notify driver about station removal before RCU * synchronisation. This is useful if a driver needs to have station * pointers protected using RCU, it can then use this call to clear * the pointers instead of waiting for an RCU grace period to elapse * in @sta_state. * The callback can sleep. * * @link_sta_rc_update: Notifies the driver of changes to the bitrates that can * be used to transmit to the station. The changes are advertised with bits * from &enum ieee80211_rate_control_changed and the values are reflected * in the station data. This callback should only be used when the driver * uses hardware rate control (%IEEE80211_HW_HAS_RATE_CONTROL) since * otherwise the rate control algorithm is notified directly.
* Must be atomic. * @sta_rate_tbl_update: Notifies the driver that the rate table changed. This * is only used if the configured rate control algorithm actually uses * the new rate table API, and is therefore optional. Must be atomic. * * @sta_statistics: Get statistics for this station. For example with beacon * filtering, the statistics kept by mac80211 might not be accurate, so * let the driver pre-fill the statistics. The driver can fill most of * the values (indicating which by setting the filled bitmap), but not * all of them make sense - see the source for which ones are possible. * Statistics that the driver doesn't fill will be filled by mac80211. * The callback can sleep. * * @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max), * bursting) for a hardware TX queue. * Returns a negative error code on failure. * The callback can sleep. * * @get_tsf: Get the current TSF timer value from firmware/hardware. Currently, * this is only used for IBSS mode BSSID merging and debugging. Is not a * required function. * The callback can sleep. * * @set_tsf: Set the TSF timer to the specified value in the firmware/hardware. * Currently, this is only used for IBSS mode debugging. Is not a * required function. * The callback can sleep. * * @offset_tsf: Offset the TSF timer by the specified value in the * firmware/hardware. Preferred to set_tsf as it avoids delay between * calling set_tsf() and hardware getting programmed, which will show up * as TSF delay. Is not a required function. * The callback can sleep. * * @reset_tsf: Reset the TSF timer and allow firmware/hardware to synchronize * with other STAs in the IBSS. This is only used in IBSS mode. This * function is optional if the firmware/hardware takes full care of * TSF synchronization. * The callback can sleep. * * @tx_last_beacon: Determine whether the last IBSS beacon was sent by us. * This is needed only for IBSS mode and the result of this function is * used to determine whether to reply to Probe Requests. * Returns non-zero if this device sent the last beacon. * The callback can sleep. * * @get_survey: Return per-channel survey information * * @rfkill_poll: Poll rfkill hardware state. If you need this, you also * need to set wiphy->rfkill_poll to %true before registration, * and need to call wiphy_rfkill_set_hw_state() in the callback. * The callback can sleep. * * @set_coverage_class: Set slot time for given coverage class as specified * in IEEE 802.11-2007 section 17.3.8.6 and modify ACK timeout * accordingly; a coverage class of -1 enables the ACK timeout * estimation algorithm (dynack), while setting any valid coverage class * value disables dynack. This callback is not required and may sleep. * * @testmode_cmd: Implement a cfg80211 test mode command. The passed @vif may * be %NULL. The callback can sleep. * @testmode_dump: Implement a cfg80211 test mode dump. The callback can sleep. * * @flush: Flush all pending frames from the hardware queue, making sure * that the hardware queues are empty. The @queues parameter is a bitmap * of queues to flush, which is useful if different virtual interfaces * use different hardware queues; it may also indicate all queues. * If the parameter @drop is set to %true, pending frames may be dropped. * Note that vif can be NULL. * The callback can sleep. * * @flush_sta: Flush or drop all pending frames from the hardware queue(s) for * the given station, as it's about to be removed. * The callback can sleep.
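*
* As an illustration of @flush, a minimal sketch (hypothetical driver; the
* hypothetical_hw_*() helpers are assumptions, not mac80211 API). The
* @queues bitmap refers to the driver's own HW queue IDs::
*
*	static void drv_flush(struct ieee80211_hw *hw,
*			      struct ieee80211_vif *vif,
*			      u32 queues, bool drop)
*	{
*		if (drop)
*			hypothetical_hw_drop_pending(hw, queues);
*		else
*			hypothetical_hw_wait_for_empty(hw, queues);
*	}
*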
* * @channel_switch: Drivers that need (or want) to offload the channel * switch operation for CSAs received from the AP may implement this * callback. They must then call ieee80211_chswitch_done() to indicate * completion of the channel switch. * * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device. * Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may * reject TX/RX mask combinations they cannot support by returning -EINVAL * (also see nl80211.h @NL80211_ATTR_WIPHY_ANTENNA_TX). * * @get_antenna: Get current antenna configuration from device (tx_ant, rx_ant). * * @remain_on_channel: Starts an off-channel period on the given channel, must * call back to ieee80211_ready_on_channel() when on that channel. Note * that normal channel traffic is not stopped as this is intended for hw * offload. Frames to transmit on the off-channel channel are transmitted * normally except for the %IEEE80211_TX_CTL_TX_OFFCHAN flag. When the * duration (which will always be non-zero) expires, the driver must call * ieee80211_remain_on_channel_expired(). * Note that this callback may be called while the device is in IDLE and * must be accepted in this case. * This callback may sleep. * @cancel_remain_on_channel: Requests that an ongoing off-channel period is * aborted before it expires. This callback may sleep. * * @set_ringparam: Set tx and rx ring sizes. * * @get_ringparam: Get tx and rx ring current and maximum sizes. * * @tx_frames_pending: Check if there is any pending frame in the hardware * queues before entering power save. * * @set_bitrate_mask: Set a mask of rates to be used for rate control selection * when transmitting a frame. Currently only legacy rates are handled. * The callback can sleep. * @event_callback: Notify driver about any event in mac80211. See * &enum ieee80211_event_type for the different types. * The callback must be atomic. * * @release_buffered_frames: Release buffered frames according to the given * parameters. In the case where the driver buffers some frames for * sleeping stations mac80211 will use this callback to tell the driver * to release some frames, either for PS-poll or uAPSD. * Note that if the @more_data parameter is %false the driver must check * if there are more frames on the given TIDs, and if there are more than * the frames being released then it must still set the more-data bit in * the frame. If the @more_data parameter is %true, then of course the * more-data bit must always be set. * The @tids parameter tells the driver which TIDs to release frames * from, for PS-poll it will always have only a single bit set. * In the case this is used for a PS-poll initiated release, the * @num_frames parameter will always be 1 so code can be shared. In * this case the driver must also set %IEEE80211_TX_STATUS_EOSP flag * on the TX status (and must report TX status) so that the PS-poll * period is properly ended. This is used to avoid sending multiple * responses for a retried PS-poll frame. * In the case this is used for uAPSD, the @num_frames parameter may be * bigger than one, but the driver may send fewer frames (it must send * at least one, however). In this case it is also responsible for * setting the EOSP flag in the QoS header of the frames. Also, when the * service period ends, the driver must set %IEEE80211_TX_STATUS_EOSP * on the last frame in the SP. Alternatively, it may call the function * ieee80211_sta_eosp() to inform mac80211 of the end of the SP. * This callback must be atomic. 
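*
* As an illustration of @release_buffered_frames above, a minimal sketch of
* the uAPSD case for a driver that buffers frames itself (the
* hypothetical_*() helpers are assumptions, not mac80211 API). A real
* driver must additionally set the EOSP bit in the QoS header of the last
* released frame, as described above::
*
*	static void drv_release_buffered_frames(struct ieee80211_hw *hw,
*				struct ieee80211_sta *sta, u16 tids,
*				int num_frames,
*				enum ieee80211_frame_release_type reason,
*				bool more_data)
*	{
*		int i;
*
*		for (i = 0; i < num_frames; i++) {
*			struct sk_buff *skb = hypothetical_dequeue(sta, tids);
*			struct ieee80211_tx_info *info;
*
*			if (!skb)
*				break;
*			info = IEEE80211_SKB_CB(skb);
*			if (i == num_frames - 1)
*				info->flags |= IEEE80211_TX_STATUS_EOSP;
*			hypothetical_hw_tx(hw, skb);
*		}
*	}
*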
* @allow_buffered_frames: Prepare device to allow the given number of frames * to go out to the given station. The frames will be sent by mac80211 * via the usual TX path after this call. The TX information for frames * released will also have the %IEEE80211_TX_CTL_NO_PS_BUFFER flag set * and the last one will also have %IEEE80211_TX_STATUS_EOSP set. In case * frames from multiple TIDs are released and the driver might reorder * them between the TIDs, it must set the %IEEE80211_TX_STATUS_EOSP flag * on the last frame and clear it on all others and also handle the EOSP * bit in the QoS header correctly. Alternatively, it can also call the * ieee80211_sta_eosp() function. * The @tids parameter is a bitmap and tells the driver which TIDs the * frames will be on; it will at most have two bits set. * This callback must be atomic. * * @get_et_sset_count: Ethtool API to get string-set count. * Note that the wiphy mutex is not held for this callback since it's * expected to return a static value. * * @get_et_stats: Ethtool API to get a set of u64 stats. * * @get_et_strings: Ethtool API to get a set of strings to describe stats * and perhaps other supported types of ethtool data-sets. * Note that the wiphy mutex is not held for this callback since it's * expected to return a static value. * * @mgd_prepare_tx: Prepare for transmitting a management frame for association * before associated. In multi-channel scenarios, a virtual interface is * bound to a channel before it is associated, but as it isn't associated * yet it need not necessarily be given airtime, in particular since any * transmission to a P2P GO needs to be synchronized against the GO's * powersave state. mac80211 will call this function before transmitting a * management frame prior to having successfully associated to allow the * driver to give it channel time for the transmission, to get a response * and be able to synchronize with the GO. * The callback will be called before each transmission and upon return * mac80211 will transmit the frame right away. * Additional information is passed in the &struct ieee80211_prep_tx_info * data. If duration there is greater than zero, mac80211 hints to the * driver the duration for which the operation is requested. * The callback is optional and can (should!) sleep. * @mgd_complete_tx: Notify the driver that the response frame for a previously * transmitted frame announced with @mgd_prepare_tx was received, the data * is filled similarly to @mgd_prepare_tx though the duration is not used. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending * a TDLS discovery-request, we expect a reply to arrive on the AP's * channel. We must stay on the channel (no PSM, scan, etc.), since a TDLS * setup-response is a direct packet not buffered by the AP. * mac80211 will call this function just before the transmission of a TDLS * discovery-request. The recommended period of protection is at least * 2 * (DTIM period). * The callback is optional and can sleep. * * @add_chanctx: Notifies device driver about new channel context creation. * This callback may sleep. * @remove_chanctx: Notifies device driver about channel context destruction. * This callback may sleep. * @change_chanctx: Notifies device driver about channel context changes that * may happen when combining different virtual interfaces on the same * channel context with different settings * This callback may sleep. * @assign_vif_chanctx: Notifies device driver about channel context being bound * to vif.
Possible use is for hw queue remapping. * This callback may sleep. * @unassign_vif_chanctx: Notifies device driver about channel context being * unbound from vif. * This callback may sleep. * @switch_vif_chanctx: switch a number of vifs from one chanctx to * another, as specified in the list of * @ieee80211_vif_chanctx_switch passed to the driver, according * to the mode defined in &ieee80211_chanctx_switch_mode. * This callback may sleep. * * @start_ap: Start operation on the AP interface, this is called after all the * information in bss_conf is set and beacon can be retrieved. A channel * context is bound before this is called. Note that if the driver uses * software scan or ROC, this (and @stop_ap) isn't called when the AP is * just "paused" for scanning/ROC, which is indicated by the beacon being * disabled/enabled via @bss_info_changed. * @stop_ap: Stop operation on the AP interface. * * @reconfig_complete: Called after a call to ieee80211_restart_hw() and * during resume, when the reconfiguration has completed. * This can help the driver implement the reconfiguration step (and * indicate mac80211 is ready to receive frames). * This callback may sleep. * * @ipv6_addr_change: IPv6 address assignment on the given interface changed. * Currently, this is only called for managed or P2P client interfaces. * This callback is optional; it must not sleep. * * @channel_switch_beacon: Starts a channel switch to a new channel. * Beacons are modified to include CSA or ECSA IEs before calling this * function. The corresponding count fields in these IEs must be * decremented, and when they reach 1 the driver must call * ieee80211_csa_finish(). Drivers which use ieee80211_beacon_get() * get the csa counter decremented by mac80211, but must check if it is * 1 using ieee80211_beacon_cntdwn_is_complete() after the beacon has been * transmitted and then call ieee80211_csa_finish(). * If the CSA count starts as zero or 1, this function will not be called, * since there won't be any time to beacon before the switch anyway. * @pre_channel_switch: This is an optional callback that is called * before a channel switch procedure is started (i.e. when a STA * gets a CSA or a userspace initiated channel-switch), allowing * the driver to prepare for the channel switch. * @post_channel_switch: This is an optional callback that is called * after a channel switch procedure is completed, allowing the * driver to go back to a normal configuration. * @abort_channel_switch: This is an optional callback that is called * when a channel switch procedure was aborted, allowing the * driver to go back to a normal configuration. * @channel_switch_rx_beacon: This is an optional callback that is called * when a channel switch procedure is in progress and an additional beacon * with a CSA IE was received, allowing the driver to track changes in the * count. * @join_ibss: Join an IBSS (on an IBSS interface); this is called after all * information in bss_conf is set up and the beacon can be retrieved. A * channel context is bound before this is called. * @leave_ibss: Leave the IBSS again. * * @get_expected_throughput: extract the expected throughput towards the * specified station. The returned value is expressed in Kbps. It returns 0 * if the RC algorithm does not have proper data to provide. * * @get_txpower: get current maximum tx power (in dBm) based on configuration * and hardware limits. * * @tdls_channel_switch: Start channel-switching with a TDLS peer.
The driver * is responsible for continually initiating channel-switching operations * and returning to the base channel for communication with the AP. The * driver receives a channel-switch request template and the location of * the switch-timing IE within the template as part of the invocation. * The template is valid only within the call, and the driver can * optionally copy the skb for further re-use. * @tdls_cancel_channel_switch: Stop channel-switching with a TDLS peer. Both * peers must be on the base channel when the call completes. * @tdls_recv_channel_switch: a TDLS channel-switch related frame (request or * response) has been received from a remote peer. The driver gets * parameters parsed from the incoming frame and may use them to continue * an ongoing channel-switch operation. In addition, a channel-switch * response template is provided, together with the location of the * switch-timing IE within the template. The skb can only be used within * the function call. * * @wake_tx_queue: Called when new packets have been added to the queue. * @sync_rx_queues: Process all pending frames in RSS queues. This is a * synchronization which is needed in case the driver has pending frames * in its RSS queues that were received prior to the control path action * currently being taken (e.g. disassociation) but have not been processed * yet. * * @start_nan: join an existing NAN cluster, or create a new one. * @stop_nan: leave the NAN cluster. * @nan_change_conf: change NAN configuration. The data in cfg80211_nan_conf * contains full new configuration and changes specify which parameters * are changed with respect to the last NAN config. * The driver gets both full configuration and the changed parameters since * some devices may need the full configuration while others need only the * changed parameters. * @add_nan_func: Add a NAN function. Returns 0 on success. The data in * cfg80211_nan_func must not be referenced outside the scope of * this call. * @del_nan_func: Remove a NAN function. The driver must call * ieee80211_nan_func_terminated() with * NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST reason code upon removal. * @can_aggregate_in_amsdu: Called in order to determine if HW supports * aggregating two specific frames in the same A-MSDU. The relation * between the skbs should be symmetric and transitive. Note that while * skb is always a real frame, head may or may not be an A-MSDU. * @get_ftm_responder_stats: Retrieve FTM responder statistics, if available. * Statistics should be cumulative, currently no way to reset is provided. * * @start_pmsr: start peer measurement (e.g. FTM) (this call can sleep) * @abort_pmsr: abort peer measurement (this call can sleep) * @set_tid_config: Apply TID specific configurations. This callback may sleep. * @reset_tid_config: Reset TID specific configuration for the peer. * This callback may sleep. * @update_vif_offload: Update virtual interface offload flags * This callback may sleep. * @sta_set_4addr: Called to notify the driver when a station starts/stops using * 4-address mode * @set_sar_specs: Update the SAR (TX power) settings. * @sta_set_decap_offload: Called to notify the driver when a station is allowed * to use rx decapsulation offload * @add_twt_setup: Update hw with TWT agreement parameters received from the peer. * This callback allows the hw to check if requested parameters * are supported and if there is enough room for a new agreement. * The hw is expected to set the agreement result in the req_type field of * the twt structure.
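*
* As an illustration of @can_aggregate_in_amsdu above, a minimal sketch
* (the size limit and its HYPOTHETICAL_HW_AMSDU_MAX_LEN constant are
* assumptions, not mac80211 API); frames are only combined while the
* resulting A-MSDU still fits the hardware's frame buffer::
*
*	static bool drv_can_aggregate_in_amsdu(struct ieee80211_hw *hw,
*					       struct sk_buff *head,
*					       struct sk_buff *skb)
*	{
*		return head->len + skb->len <= HYPOTHETICAL_HW_AMSDU_MAX_LEN;
*	}
*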
* @twt_teardown_request: Update the hw with the TWT teardown request received * from the peer. * @set_radar_background: Configure dedicated offchannel chain available for * radar/CAC detection on some hw. This chain can't be used to transmit * or receive frames and it is bound to a running wdev. * Background radar/CAC detection makes it possible to avoid the CAC * downtime by switching to a different channel during CAC detection on * the selected radar channel. * The caller is expected to set chandef pointer to NULL in order to * disable background CAC/radar detection. * @net_fill_forward_path: Called from .ndo_fill_forward_path in order to * resolve a path for hardware flow offloading * @can_activate_links: Checks if a specific active_links bitmap is * supported by the driver. * @change_vif_links: Change the valid links on an interface. Note that while * a link is being removed, the old link information (link_conf pointer) * is still valid, but may immediately disappear after the function * returns. The old or new links bitmaps may be 0 if going from/to a * non-MLO situation. * The @old array contains pointers to the old bss_conf structures * that were already removed, in case they're needed. * This callback can sleep. * @change_sta_links: Change the valid links of a station, similar to * @change_vif_links. This callback can sleep. * Note that a sta can also be inserted or removed with valid links, * i.e. passed to @sta_add/@sta_state with sta->valid_links not zero. * In fact, a station cannot change between having valid_links and not * having them. * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. This is * not restored at HW reset by mac80211 so drivers need to take care of * that. * @net_setup_tc: Called from .ndo_setup_tc in order to prepare hardware * flow offloading for flows originating from the vif. * Note that the driver must not assume that the vif driver_data is valid * at this point, since the callback can be called during netdev teardown. * @can_neg_ttlm: for managed interface, requests the driver to determine * if the requested TID-To-Link mapping can be accepted or not. * If it's not accepted the driver may suggest a preferred mapping and * modify @ttlm parameter with the suggested TID-to-Link mapping. * @prep_add_interface: prepare for interface addition. This can be used by * drivers to prepare for the addition of a new interface, e.g., allocate * the needed resources etc. This callback doesn't guarantee that an * interface with the specified type will be added, and thus drivers that * implement this callback need to handle such cases. The type is the full * &enum nl80211_iftype.
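*
* As an overall illustration, a hypothetical driver implementing only the
* mandatory callbacks might set up its &struct ieee80211_ops as follows
* (sketch; the drv_*() functions are assumed, while
* ieee80211_handle_wake_tx_queue() is the generic mac80211 helper)::
*
*	static const struct ieee80211_ops drv_ops = {
*		.tx = drv_tx,
*		.start = drv_start,
*		.stop = drv_stop,
*		.add_interface = drv_add_interface,
*		.remove_interface = drv_remove_interface,
*		.config = drv_config,
*		.configure_filter = drv_configure_filter,
*		.wake_tx_queue = ieee80211_handle_wake_tx_queue,
*	};
*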
*/ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, struct ieee80211_tx_control *control, struct sk_buff *skb); int (*start)(struct ieee80211_hw *hw); void (*stop)(struct ieee80211_hw *hw, bool suspend); #ifdef CONFIG_PM int (*suspend)(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan); int (*resume)(struct ieee80211_hw *hw); void (*set_wakeup)(struct ieee80211_hw *hw, bool enabled); #endif int (*add_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*change_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_iftype new_type, bool p2p); void (*remove_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*config)(struct ieee80211_hw *hw, u32 changed); void (*bss_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, u64 changed); void (*vif_cfg_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 changed); void (*link_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, u64 changed); int (*start_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); void (*stop_ap)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); u64 (*prepare_multicast)(struct ieee80211_hw *hw, struct netdev_hw_addr_list *mc_list); void (*configure_filter)(struct ieee80211_hw *hw, unsigned int changed_flags, unsigned int *total_flags, u64 multicast); void (*config_iface_filter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int filter_flags, unsigned int changed_flags); int (*set_tim)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set); int (*set_key)(struct ieee80211_hw *hw, enum set_key_cmd cmd, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key); void (*update_tkip_key)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_key_conf *conf, struct ieee80211_sta *sta, u32 iv32, u16 *phase1key); void (*set_rekey_data)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_gtk_rekey_data *data); void (*set_default_unicast_key)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int idx); int (*hw_scan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_scan_request *req); void (*cancel_hw_scan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*sched_scan_start)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_sched_scan_request *req, struct ieee80211_scan_ies *ies); int (*sched_scan_stop)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*sw_scan_start)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const u8 *mac_addr); void (*sw_scan_complete)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*get_stats)(struct ieee80211_hw *hw, struct ieee80211_low_level_stats *stats); void (*get_key_seq)(struct ieee80211_hw *hw, struct ieee80211_key_conf *key, struct ieee80211_key_seq *seq); int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value); int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value); int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); #ifdef CONFIG_MAC80211_DEBUGFS void (*vif_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*link_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct dentry 
*dir); void (*sta_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); void (*link_sta_add_debugfs)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_sta *link_sta, struct dentry *dir); #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); int (*sta_set_txpwr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_state)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state); void (*sta_pre_rcu_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*link_sta_rc_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_sta *link_sta, u32 changed); void (*sta_rate_tbl_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*sta_statistics)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct station_info *sinfo); int (*conf_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id, u16 ac, const struct ieee80211_tx_queue_params *params); u64 (*get_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*set_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 tsf); void (*offset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, s64 offset); void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*tx_last_beacon)(struct ieee80211_hw *hw); /** * @ampdu_action: * Perform a certain A-MPDU action. * The RA/TID combination determines the destination and TID we want * the ampdu action to be performed for. The action is defined through * ieee80211_ampdu_mlme_action. * When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver * may neither send aggregates containing more subframes than @buf_size * nor send aggregates in a way that lost frames would exceed the * buffer size. If just limiting the aggregate size, this would be * possible with a buf_size of 8: * * - ``TX: 1.....7`` * - ``RX: 2....7`` (lost frame #1) * - ``TX: 8..1...`` * * which is invalid since #1 was now re-transmitted well past the * buffer size of 8. Correct ways to retransmit #1 would be: * * - ``TX: 1 or`` * - ``TX: 18 or`` * - ``TX: 81`` * * Even ``189`` would be wrong since 1 could be lost again. * * Returns a negative error code on failure. The driver may return * %IEEE80211_AMPDU_TX_START_IMMEDIATE for %IEEE80211_AMPDU_TX_START * if the session can start immediately. * * The callback can sleep. 
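*
* As an illustration, a minimal sketch of this callback (hypothetical
* driver where RX aggregation is offloaded and TX aggregation needs no
* setup; the hypothetical_hw_*() helpers are assumptions, not mac80211
* API)::
*
*	static int drv_ampdu_action(struct ieee80211_hw *hw,
*				    struct ieee80211_vif *vif,
*				    struct ieee80211_ampdu_params *params)
*	{
*		switch (params->action) {
*		case IEEE80211_AMPDU_RX_START:
*			return hypothetical_hw_rx_agg_start(hw, params->sta,
*							    params->tid,
*							    params->ssn,
*							    params->buf_size);
*		case IEEE80211_AMPDU_RX_STOP:
*			return hypothetical_hw_rx_agg_stop(hw, params->sta,
*							   params->tid);
*		case IEEE80211_AMPDU_TX_START:
*			return IEEE80211_AMPDU_TX_START_IMMEDIATE;
*		case IEEE80211_AMPDU_TX_OPERATIONAL:
*			hypothetical_hw_tx_agg_oper(hw, params->sta,
*						    params->tid,
*						    params->buf_size,
*						    params->amsdu);
*			return 0;
*		case IEEE80211_AMPDU_TX_STOP_CONT:
*			hypothetical_hw_tx_agg_stop(hw, params->sta,
*						    params->tid);
*			ieee80211_stop_tx_ba_cb_irqsafe(vif, params->sta->addr,
*							params->tid);
*			return 0;
*		case IEEE80211_AMPDU_TX_STOP_FLUSH:
*		case IEEE80211_AMPDU_TX_STOP_FLUSH_CONT:
*			hypothetical_hw_tx_agg_stop(hw, params->sta,
*						    params->tid);
*			return 0;
*		default:
*			return -EOPNOTSUPP;
*		}
*	}
*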
*/ int (*ampdu_action)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_ampdu_params *params); int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class); #ifdef CONFIG_NL80211_TESTMODE int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len); int (*testmode_dump)(struct ieee80211_hw *hw, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len); #endif void (*flush)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop); void (*flush_sta)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); int (*remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel *chan, int duration, enum ieee80211_roc_type type); int (*cancel_remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*set_ringparam)(struct ieee80211_hw *hw, u32 tx, u32 rx); void (*get_ringparam)(struct ieee80211_hw *hw, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max); bool (*tx_frames_pending)(struct ieee80211_hw *hw); int (*set_bitrate_mask)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const struct cfg80211_bitrate_mask *mask); void (*event_callback)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const struct ieee80211_event *event); void (*allow_buffered_frames)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data); void (*release_buffered_frames)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, u16 tids, int num_frames, enum ieee80211_frame_release_type reason, bool more_data); int (*get_et_sset_count)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int sset); void (*get_et_stats)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ethtool_stats *stats, u64 *data); void (*get_et_strings)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 sset, u8 *data); void (*mgd_prepare_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_prep_tx_info *info); void (*mgd_complete_tx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_prep_tx_info *info); void (*mgd_protect_tdls_discover)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id); int (*add_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); void (*remove_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); void (*change_chanctx)(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx, u32 changed); int (*assign_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx); void (*unassign_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx_conf *ctx); int (*switch_vif_chanctx)(struct ieee80211_hw *hw, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode); void (*reconfig_complete)(struct ieee80211_hw *hw, enum ieee80211_reconfig_type reconfig_type); #if IS_ENABLED(CONFIG_IPV6) void (*ipv6_addr_change)(struct ieee80211_hw 
*hw, struct ieee80211_vif *vif, struct inet6_dev *idev); #endif void (*channel_switch_beacon)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_chan_def *chandef); int (*pre_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); int (*post_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); void (*abort_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); void (*channel_switch_rx_beacon)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); int (*join_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*leave_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); u32 (*get_expected_throughput)(struct ieee80211_hw *hw, struct ieee80211_sta *sta); int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id, int *dbm); int (*tdls_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u8 oper_class, struct cfg80211_chan_def *chandef, struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie); void (*tdls_cancel_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); void (*tdls_recv_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_tdls_ch_sw_params *params); void (*wake_tx_queue)(struct ieee80211_hw *hw, struct ieee80211_txq *txq); void (*sync_rx_queues)(struct ieee80211_hw *hw); int (*start_nan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_nan_conf *conf); int (*stop_nan)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*nan_change_conf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_nan_conf *conf, u32 changes); int (*add_nan_func)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const struct cfg80211_nan_func *nan_func); void (*del_nan_func)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u8 instance_id); bool (*can_aggregate_in_amsdu)(struct ieee80211_hw *hw, struct sk_buff *head, struct sk_buff *skb); int (*get_ftm_responder_stats)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_ftm_responder_stats *ftm_stats); int (*start_pmsr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_pmsr_request *request); void (*abort_pmsr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_pmsr_request *request); int (*set_tid_config)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct cfg80211_tid_config *tid_conf); int (*reset_tid_config)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u8 tids); void (*update_vif_offload)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*sta_set_4addr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, bool enabled); int (*set_sar_specs)(struct ieee80211_hw *hw, const struct cfg80211_sar_specs *sar); void (*sta_set_decap_offload)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, bool enabled); void (*add_twt_setup)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct ieee80211_twt_setup *twt); void (*twt_teardown_request)(struct ieee80211_hw *hw, struct ieee80211_sta *sta, u8 flowid); int (*set_radar_background)(struct ieee80211_hw *hw, struct cfg80211_chan_def *chandef); int (*net_fill_forward_path)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, 
struct ieee80211_sta *sta, struct net_device_path_ctx *ctx, struct net_device_path *path); bool (*can_activate_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 active_links); int (*change_vif_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 old_links, u16 new_links, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]); int (*change_sta_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u16 old_links, u16 new_links); int (*set_hw_timestamp)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct cfg80211_set_hw_timestamp *hwts); int (*net_setup_tc)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct net_device *dev, enum tc_setup_type type, void *type_data); enum ieee80211_neg_ttlm_res (*can_neg_ttlm)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_neg_ttlm *ttlm); void (*prep_add_interface)(struct ieee80211_hw *hw, enum nl80211_iftype type); }; /** * ieee80211_alloc_hw_nm - Allocate a new hardware device * * This must be called once for each hardware device. The returned pointer * must be used to refer to this device when calling other functions. * mac80211 allocates a private data area for the driver pointed to by * @priv in &struct ieee80211_hw, the size of this area is given as * @priv_data_len. * * @priv_data_len: length of private data * @ops: callbacks for this device * @requested_name: Requested name for this device. * NULL is a valid value, and means to use the default naming (phy%d) * * Return: A pointer to the new hardware device, or %NULL on error. */ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, const struct ieee80211_ops *ops, const char *requested_name); /** * ieee80211_alloc_hw - Allocate a new hardware device * * This must be called once for each hardware device. The returned pointer * must be used to refer to this device when calling other functions. * mac80211 allocates a private data area for the driver pointed to by * @priv in &struct ieee80211_hw, the size of this area is given as * @priv_data_len. * * @priv_data_len: length of private data * @ops: callbacks for this device * * Return: A pointer to the new hardware device, or %NULL on error. */ static inline struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { return ieee80211_alloc_hw_nm(priv_data_len, ops, NULL); } /** * ieee80211_register_hw - Register hardware device * * You must call this function before any other functions in * mac80211. Note that before a hardware can be registered, you * need to fill the contained wiphy's information. * * @hw: the device to register as returned by ieee80211_alloc_hw() * * Return: 0 on success. An error code otherwise. */ int ieee80211_register_hw(struct ieee80211_hw *hw); /** * struct ieee80211_tpt_blink - throughput blink description * @throughput: throughput in Kbit/sec * @blink_time: blink time in milliseconds * (full cycle, i.e.
one off + one on period) */ struct ieee80211_tpt_blink { int throughput; int blink_time; }; /** * enum ieee80211_tpt_led_trigger_flags - throughput trigger flags * @IEEE80211_TPT_LEDTRIG_FL_RADIO: enable blinking with radio * @IEEE80211_TPT_LEDTRIG_FL_WORK: enable blinking when working * @IEEE80211_TPT_LEDTRIG_FL_CONNECTED: enable blinking when at least one * interface is connected in some way, including being an AP */ enum ieee80211_tpt_led_trigger_flags { IEEE80211_TPT_LEDTRIG_FL_RADIO = BIT(0), IEEE80211_TPT_LEDTRIG_FL_WORK = BIT(1), IEEE80211_TPT_LEDTRIG_FL_CONNECTED = BIT(2), }; #ifdef CONFIG_MAC80211_LEDS const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw); const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw); const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw); const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw); const char * __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len); #endif /** * ieee80211_get_tx_led_name - get name of TX LED * * mac80211 creates a transmit LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. */ static inline const char *ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_tx_led_name(hw); #else return NULL; #endif } /** * ieee80211_get_rx_led_name - get name of RX LED * * mac80211 creates a receive LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. */ static inline const char *ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_rx_led_name(hw); #else return NULL; #endif } /** * ieee80211_get_assoc_led_name - get name of association LED * * mac80211 creates an association LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs. */ static inline const char *ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_assoc_led_name(hw); #else return NULL; #endif } /** * ieee80211_get_radio_led_name - get name of radio LED * * mac80211 creates a radio change LED trigger for each wireless hardware * that can be used to drive LEDs if your driver registers a LED device. * This function returns the name (or %NULL if not configured for LEDs) * of the trigger so you can automatically link the LED device. * * @hw: the hardware to get the LED trigger name for * * Return: The name of the LED trigger. %NULL if not configured for LEDs.
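*
* As with the other LED trigger name getters here, the returned name is
* typically used as the default trigger of a LED class device registered
* by the driver, e.g. (sketch; priv, the LED name and pdev are
* hypothetical)::
*
*	priv->tx_led.name = "drv-phy0::tx";
*	priv->tx_led.default_trigger = ieee80211_get_tx_led_name(hw);
*	led_classdev_register(&pdev->dev, &priv->tx_led);
*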
*/ static inline const char *ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_get_radio_led_name(hw); #else return NULL; #endif } /** * ieee80211_create_tpt_led_trigger - create throughput LED trigger * @hw: the hardware to create the trigger for * @flags: trigger flags, see &enum ieee80211_tpt_led_trigger_flags * @blink_table: the blink table -- needs to be ordered by throughput * @blink_table_len: size of the blink table * * Return: %NULL (in case of error, or if no LED triggers are * configured) or the name of the new trigger. * * Note: This function must be called before ieee80211_register_hw(). */ static inline const char * ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len) { #ifdef CONFIG_MAC80211_LEDS return __ieee80211_create_tpt_led_trigger(hw, flags, blink_table, blink_table_len); #else return NULL; #endif } /** * ieee80211_unregister_hw - Unregister a hardware device * * This function instructs mac80211 to free allocated resources * and unregister netdevices from the networking subsystem. * * @hw: the hardware to unregister */ void ieee80211_unregister_hw(struct ieee80211_hw *hw); /** * ieee80211_free_hw - free hardware descriptor * * This function frees everything that was allocated, including the * private data for the driver. You must call ieee80211_unregister_hw() * before calling this function. * * @hw: the hardware to free */ void ieee80211_free_hw(struct ieee80211_hw *hw); /** * ieee80211_restart_hw - restart hardware completely * * Call this function when the hardware was restarted for some reason * (hardware error, ...) and the driver is unable to restore its state * by itself. mac80211 assumes that at this point the driver/hardware * is completely uninitialised and stopped; it starts the process by * calling the ->start() operation. The driver will need to reset all * internal state that it has prior to calling this function. * * @hw: the hardware to restart */ void ieee80211_restart_hw(struct ieee80211_hw *hw); /** * ieee80211_rx_list - receive frame and store processed skbs in a list * * Use this function to hand received frames to mac80211. The receive * buffer in @skb must start with an IEEE 802.11 header. If a paged @skb * is used, it is recommended that the driver put the ieee80211 header of * the frame on the linear part of the @skb to avoid memory allocation * and/or memcpy by the stack. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * This function must be called with BHs disabled and the RCU read lock held. * * @hw: the hardware this frame came in on * @sta: the station the frame was received from, or %NULL * @skb: the buffer to receive, owned by mac80211 after this call * @list: the destination list */ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct sk_buff *skb, struct list_head *list);
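/*
 * Example (illustrative sketch): handing a received frame to mac80211 from
 * a driver's NAPI poll path. "foo_rx_ctx" and its members are invented;
 * the skb is assumed to start with the IEEE 802.11 header. From hard-IRQ
 * context a driver would use ieee80211_rx_irqsafe() instead.
 */
struct foo_rx_ctx {
	struct ieee80211_hw *hw;
	struct napi_struct *napi;
};

static void foo_rx_one(struct foo_rx_ctx *ctx, struct sk_buff *skb)
{
	/* NULL station: let mac80211 look it up from the frame header */
	ieee80211_rx_napi(ctx->hw, NULL, skb, ctx->napi);
}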
/** * ieee80211_rx_napi - receive frame from NAPI context * * Use this function to hand received frames to mac80211. The receive * buffer in @skb must start with an IEEE 802.11 header. If a paged @skb * is used, it is recommended that the driver put the ieee80211 header of * the frame on the linear part of the @skb to avoid memory allocation * and/or memcpy by the stack. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * This function must be called with BHs disabled. * * @hw: the hardware this frame came in on * @sta: the station the frame was received from, or %NULL * @skb: the buffer to receive, owned by mac80211 after this call * @napi: the NAPI context */ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct sk_buff *skb, struct napi_struct *napi); /** * ieee80211_rx - receive frame * * Use this function to hand received frames to mac80211. The receive * buffer in @skb must start with an IEEE 802.11 header. If a paged @skb * is used, it is recommended that the driver put the ieee80211 header of * the frame on the linear part of the @skb to avoid memory allocation * and/or memcpy by the stack. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls to * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be * mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * In process context use instead ieee80211_rx_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ static inline void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) { ieee80211_rx_napi(hw, NULL, skb, NULL); } /** * ieee80211_rx_irqsafe - receive frame * * Like ieee80211_rx() but can be called in IRQ context * (internally defers to a tasklet.) * * Calls to this function, ieee80211_rx() or ieee80211_rx_ni() may not * be mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb); /** * ieee80211_rx_ni - receive frame (in process context) * * Like ieee80211_rx() but can be called in process context * (internally disables bottom halves). * * Calls to this function, ieee80211_rx() and ieee80211_rx_irqsafe() may * not be mixed for a single hardware. Must not run concurrently with * ieee80211_tx_status_skb() or ieee80211_tx_status_ni(). * * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ static inline void ieee80211_rx_ni(struct ieee80211_hw *hw, struct sk_buff *skb) { local_bh_disable(); ieee80211_rx(hw, skb); local_bh_enable(); } /** * ieee80211_sta_ps_transition - PS transition for connected sta * * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS * flag set, use this function to inform mac80211 about a connected station * entering/leaving PS mode. * * This function may not be called in IRQ context or with softirqs enabled. * * Calls to this function for a single hardware must be synchronized against * each other. * * @sta: currently connected sta * @start: start or stop PS * * Return: 0 on success.
-EINVAL when the requested PS mode is already set. */ int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start); /** * ieee80211_sta_ps_transition_ni - PS transition for connected sta * (in process context) * * Like ieee80211_sta_ps_transition() but can be called in process context * (internally disables bottom halves). Concurrent call restriction still * applies. * * @sta: currently connected sta * @start: start or stop PS * * Return: Like ieee80211_sta_ps_transition(). */ static inline int ieee80211_sta_ps_transition_ni(struct ieee80211_sta *sta, bool start) { int ret; local_bh_disable(); ret = ieee80211_sta_ps_transition(sta, start); local_bh_enable(); return ret; } /** * ieee80211_sta_pspoll - PS-Poll frame received * @sta: currently connected station * * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS flag set, * use this function to inform mac80211 that a PS-Poll frame from a * connected station was received. * This must be used in conjunction with ieee80211_sta_ps_transition() * and possibly ieee80211_sta_uapsd_trigger(); calls to all three must * be serialized. */ void ieee80211_sta_pspoll(struct ieee80211_sta *sta); /** * ieee80211_sta_uapsd_trigger - (potential) U-APSD trigger frame received * @sta: currently connected station * @tid: TID of the received (potential) trigger frame * * When operating in AP mode with the %IEEE80211_HW_AP_LINK_PS flag set, * use this function to inform mac80211 that a (potential) trigger frame * from a connected station was received. * This must be used in conjunction with ieee80211_sta_ps_transition() * and possibly ieee80211_sta_pspoll(); calls to all three must be * serialized. * %IEEE80211_NUM_TIDS can be passed as the tid if the tid is unknown. * In this case, mac80211 will not check that this tid maps to an AC * that is trigger enabled and will assume that the caller did the proper * checks. */ void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *sta, u8 tid); /* * The TX headroom reserved by mac80211 for its own tx_status functions. * This is enough for the radiotap header. */ #define IEEE80211_TX_STATUS_HEADROOM ALIGN(14, 4)
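/*
 * Example (illustrative sketch): AP-mode powersave signalling with
 * %IEEE80211_HW_AP_LINK_PS set, driven from the driver's RX path.
 * The calls below must be serialized against each other, and
 * ieee80211_sta_ps_transition() may not run in IRQ context or with
 * softirqs enabled.
 */
static void foo_sta_ps_change(struct ieee80211_sta *sta, bool sleeping)
{
	if (ieee80211_sta_ps_transition(sta, sleeping) == -EINVAL)
		pr_debug("PS mode was already set\n");
}

static void foo_ps_frame_rx(struct ieee80211_sta *sta, u8 tid, bool ps_poll)
{
	if (ps_poll)
		ieee80211_sta_pspoll(sta);
	else
		/* QoS data on a trigger-enabled AC may be a U-APSD trigger */
		ieee80211_sta_uapsd_trigger(sta, tid);
}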
/** * ieee80211_sta_set_buffered - inform mac80211 about driver-buffered frames * @sta: &struct ieee80211_sta pointer for the sleeping station * @tid: the TID that has buffered frames * @buffered: indicates whether or not frames are buffered for this TID * * If a driver buffers frames for a powersave station instead of passing * them back to mac80211 for retransmission, the station may still need * to be told that there are buffered frames via the TIM bit. * * This function informs mac80211 whether or not there are frames that are * buffered in the driver for a given TID; mac80211 can then use this data * to set the TIM bit (NOTE: This may call back into the driver's set_tim * call! Beware of the locking!) * * If all frames are released to the station (due to PS-poll or uAPSD) * then the driver needs to inform mac80211 that there are no longer * frames buffered. However, when the station wakes up mac80211 assumes * that all buffered frames will be transmitted and clears this data; * drivers need to make sure they inform mac80211 about all buffered * frames on the sleep transition (sta_notify() with %STA_NOTIFY_SLEEP). * * Note that technically mac80211 only needs to know this per AC, not per * TID, but since driver buffering will inevitably happen per TID (since * it is related to aggregation) it is easier to make mac80211 map the * TID to the AC as required instead of keeping track in all drivers that * use this API. */ void ieee80211_sta_set_buffered(struct ieee80211_sta *sta, u8 tid, bool buffered); /** * ieee80211_get_tx_rates - get the selected transmit rates for a packet * * Call this function in a driver with per-packet rate selection support * to combine the rate info in the packet tx info with the most recent * rate selection table for the station entry. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @sta: the receiver station to which this packet is sent. * @skb: the frame to be transmitted. * @dest: buffer for extracted rate/retry information * @max_rates: maximum number of rates to fetch */ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct sk_buff *skb, struct ieee80211_tx_rate *dest, int max_rates); /** * ieee80211_sta_set_expected_throughput - set the expected tpt for a station * * Call this function to notify mac80211 about a change in expected throughput * to a station. A driver for a device that does rate control in firmware can * call this function when the expected throughput estimate towards a station * changes. The information is used to tune the CoDel AQM applied to traffic * going towards that station (which can otherwise be too aggressive and cause * slow stations to starve). * * @pubsta: the station to set throughput for. * @thr: the current expected throughput in kbps. */ void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, u32 thr); /** * ieee80211_tx_rate_update - transmit rate update callback * * Drivers should call this function with a non-NULL @pubsta. * This function can be used by drivers that do not have a provision * for updating the tx rate in the data path. * * @hw: the hardware the frame was transmitted by * @pubsta: the station to update the tx rate for. * @info: tx status information */ void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_tx_info *info); /** * ieee80211_tx_status_skb - transmit status callback * * Call this function for all transmitted frames after they have been * transmitted. It is permissible to not call this function for * multicast frames but this can affect statistics. * * This function may not be called in IRQ context. Calls to this function * for a single hardware must be synchronized against each other. Calls * to this function, ieee80211_tx_status_ni() and ieee80211_tx_status_irqsafe() * may not be mixed for a single hardware. Must not run concurrently with * ieee80211_rx() or ieee80211_rx_ni(). * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ void ieee80211_tx_status_skb(struct ieee80211_hw *hw, struct sk_buff *skb);
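/*
 * Example (illustrative sketch): reporting TX completion from a hard-IRQ
 * handler. The tx-info helpers and flags are real mac80211 API; the
 * surrounding driver context is invented.
 */
static void foo_tx_complete_irq(struct ieee80211_hw *hw, struct sk_buff *skb,
				bool acked)
{
	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);

	/* clear driver/rate fields, keep what mac80211 needs */
	ieee80211_tx_info_clear_status(info);
	if (acked)
		info->flags |= IEEE80211_TX_STAT_ACK;

	/* IRQ context: the irqsafe variant internally defers to a tasklet */
	ieee80211_tx_status_irqsafe(hw, skb);
}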
/** * ieee80211_tx_status_ext - extended transmit status callback * * This function can be used as a replacement for ieee80211_tx_status_skb() * in drivers that may want to provide extra information that does not * fit into &struct ieee80211_tx_info. * * Calls to this function for a single hardware must be synchronized * against each other. Calls to this function, ieee80211_tx_status_ni() * and ieee80211_tx_status_irqsafe() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @status: tx status information */ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_tx_status *status); /** * ieee80211_tx_status_noskb - transmit status callback without skb * * This function can be used as a replacement for ieee80211_tx_status_skb() * in drivers that cannot reliably map tx status information back to * specific skbs. * * Calls to this function for a single hardware must be synchronized * against each other. Calls to this function, ieee80211_tx_status_ni() * and ieee80211_tx_status_irqsafe() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @sta: the receiver station to which this packet is sent * (NULL for multicast packets) * @info: tx status information */ static inline void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, struct ieee80211_sta *sta, struct ieee80211_tx_info *info) { struct ieee80211_tx_status status = { .sta = sta, .info = info, }; ieee80211_tx_status_ext(hw, &status); } /** * ieee80211_tx_status_ni - transmit status callback (in process context) * * Like ieee80211_tx_status_skb() but can be called in process context. * * Calls to this function, ieee80211_tx_status_skb() and * ieee80211_tx_status_irqsafe() may not be mixed * for a single hardware. * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ static inline void ieee80211_tx_status_ni(struct ieee80211_hw *hw, struct sk_buff *skb) { local_bh_disable(); ieee80211_tx_status_skb(hw, skb); local_bh_enable(); } /** * ieee80211_tx_status_irqsafe - IRQ-safe transmit status callback * * Like ieee80211_tx_status_skb() but can be called in IRQ context * (internally defers to a tasklet.) * * Calls to this function, ieee80211_tx_status_skb() and * ieee80211_tx_status_ni() may not be mixed for a single hardware. * * @hw: the hardware the frame was transmitted by * @skb: the frame that was transmitted, owned by mac80211 after this call */ void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb); /** * ieee80211_report_low_ack - report non-responding station * * When operating in AP-mode, call this function to report a non-responding * connected STA. * * @sta: the non-responding connected sta * @num_packets: number of packets sent to @sta without a response */ void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets); #define IEEE80211_MAX_CNTDWN_COUNTERS_NUM 2 /** * struct ieee80211_mutable_offsets - mutable beacon offsets * @tim_offset: position of TIM element * @tim_length: size of TIM element * @cntdwn_counter_offs: array of IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets * to countdown counters. This array can contain zero values which * should be ignored. * @mbssid_off: position of the multiple bssid element */ struct ieee80211_mutable_offsets { u16 tim_offset; u16 tim_length; u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; u16 mbssid_off; }; /** * ieee80211_beacon_get_template - beacon template generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @offs: &struct ieee80211_mutable_offsets pointer to struct that will * receive the offsets that may be updated by the driver. * @link_id: the link id to which the beacon belongs (or 0 for an AP STA * that is not associated with AP MLD). * * If the driver implements beaconing modes, it must use this function to * obtain the beacon template.
 * * This function should be used if the beacon frames are generated by the * device, and then the driver must use the returned beacon as the template. * The driver or the device is responsible for updating the DTIM and, when * applicable, the CSA count. * * The driver is responsible for freeing the returned skb. * * Return: The beacon template. %NULL on error. */ struct sk_buff * ieee80211_beacon_get_template(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, unsigned int link_id); /** * ieee80211_beacon_get_template_ema_index - EMA beacon template generation * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @offs: &struct ieee80211_mutable_offsets pointer to struct that will * receive the offsets that may be updated by the driver. * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP). * @ema_index: index of the beacon in the EMA set. * * This function follows the same rules as ieee80211_beacon_get_template() * but returns a beacon template which includes the multiple BSSID element at * the requested index. * * Return: The beacon template. %NULL indicates the end of EMA templates. */ struct sk_buff * ieee80211_beacon_get_template_ema_index(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_mutable_offsets *offs, unsigned int link_id, u8 ema_index); /** * struct ieee80211_ema_beacons - List of EMA beacons * @cnt: count of EMA beacons. * * @bcn: array of EMA beacons. * @bcn.skb: the skb containing this specific beacon * @bcn.offs: &struct ieee80211_mutable_offsets pointer to struct that will * receive the offsets that may be updated by the driver. */ struct ieee80211_ema_beacons { u8 cnt; struct { struct sk_buff *skb; struct ieee80211_mutable_offsets offs; } bcn[]; }; /** * ieee80211_beacon_get_template_ema_list - EMA beacon template generation * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP) * * This function follows the same rules as ieee80211_beacon_get_template() * but allocates and returns a pointer to a list of all beacon templates * required to cover all profiles in the multiple BSSID set. Each template * includes only one multiple BSSID element. * * The driver must call ieee80211_beacon_free_ema_list() to free the memory. * * Return: EMA beacon templates of type struct ieee80211_ema_beacons *. * %NULL on error. */ struct ieee80211_ema_beacons * ieee80211_beacon_get_template_ema_list(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_free_ema_list - free an EMA beacon template list * @ema_beacons: list of EMA beacons of type &struct ieee80211_ema_beacons pointers. * * This function will free a list previously acquired by calling * ieee80211_beacon_get_template_ema_list(). */ void ieee80211_beacon_free_ema_list(struct ieee80211_ema_beacons *ema_beacons); /** * ieee80211_beacon_get_tim - beacon generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @tim_offset: pointer to variable that will receive the TIM IE offset. * Set to 0 if invalid (in non-AP modes). * @tim_length: pointer to variable that will receive the TIM IE length, * (including the ID and length bytes!). * Set to 0 if invalid (in non-AP modes).
 * @link_id: the link id to which the beacon belongs (or 0 for an AP STA * that is not associated with AP MLD). * * If the driver implements beaconing modes, it must use this function to * obtain the beacon frame. * * If the beacon frames are generated by the host system (i.e., not in * hardware/firmware), the driver uses this function to get each beacon * frame from mac80211 -- it is responsible for calling this function exactly * once before the beacon is needed (e.g. based on hardware interrupt). * * The driver is responsible for freeing the returned skb. * * Return: The beacon template. %NULL on error. */ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 *tim_offset, u16 *tim_length, unsigned int link_id); /** * ieee80211_beacon_get - beacon generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: the link id to which the beacon belongs (or 0 for an AP STA * that is not associated with AP MLD). * * See ieee80211_beacon_get_tim(). * * Return: See ieee80211_beacon_get_tim(). */ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id) { return ieee80211_beacon_get_tim(hw, vif, NULL, NULL, link_id); } /** * ieee80211_beacon_update_cntdwn - request mac80211 to decrement the beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * The beacon counter should be updated after each beacon transmission. * This function is called implicitly when * ieee80211_beacon_get/ieee80211_beacon_get_tim are called, however if the * beacon frames are generated by the device, the driver should call this * function after each beacon transmission to sync mac80211's beacon countdown. * * Return: new countdown value */ u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_set_cntdwn - request mac80211 to set beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @counter: the new value for the counter * * The beacon countdown can be changed by the device; this API should be * used by the device driver to update the CSA counter in mac80211. * * It should never be used together with ieee80211_beacon_update_cntdwn(), * as it will cause a race condition around the counter value. */ void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter); /** * ieee80211_csa_finish - notify mac80211 about channel switch * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * After a channel switch announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the channel can be changed. */ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id); /** * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1 * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * Return: %true if the countdown reached 1, %false otherwise */ bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, unsigned int link_id);
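/*
 * Example (illustrative sketch): a driver with device-generated beacons
 * syncing the CSA countdown after each beacon transmission, per the
 * documentation above. foo_beacon_sent() is an invented completion hook.
 */
static void foo_beacon_sent(struct ieee80211_vif *vif, unsigned int link_id)
{
	if (ieee80211_beacon_cntdwn_is_complete(vif, link_id)) {
		/* counter hit 1: tell mac80211 the channel may change */
		ieee80211_csa_finish(vif, link_id);
		return;
	}

	/* keep mac80211's countdown in sync with the device's */
	ieee80211_beacon_update_cntdwn(vif, link_id);
}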
/** * ieee80211_color_change_finish - notify mac80211 about color change * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * * After a color change announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the color can be changed. */ void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id); /** * ieee80211_proberesp_get - retrieve a Probe Response template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Creates a Probe Response template which can, for example, be uploaded to * hardware. The destination address should be set by the caller. * * Can only be called in AP mode. * * Return: The Probe Response template. %NULL on error. */ struct sk_buff *ieee80211_proberesp_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_pspoll_get - retrieve a PS Poll template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Creates a PS Poll template which can, for example, be uploaded to * hardware. The template must be updated after association so that the * correct AID, BSSID and MAC address are used. * * Note: Caller (or hardware) is responsible for setting the * &IEEE80211_FCTL_PM bit. * * Return: The PS Poll template. %NULL on error. */ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_nullfunc_get - retrieve a nullfunc template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: If the vif is an MLD, get a frame with the link addresses * for the given link ID. For a link_id < 0 you get a frame with * MLD addresses, however useful that might be. * @qos_ok: QoS NDP is acceptable to the caller, this should be set * if at all possible * * Creates a Nullfunc template which can, for example, be uploaded to * hardware. The template must be updated after association so that the * correct BSSID and address are used. * * If @qos_ok is set and the association is to an AP with QoS/WMM, the * returned packet will be a QoS NDP. * * Note: Caller (or hardware) is responsible for setting the * &IEEE80211_FCTL_PM bit as well as Duration and Sequence Control fields. * * Return: The nullfunc template. %NULL on error. */ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int link_id, bool qos_ok); /** * ieee80211_probereq_get - retrieve a Probe Request template * @hw: pointer obtained from ieee80211_alloc_hw(). * @src_addr: source MAC address * @ssid: SSID buffer * @ssid_len: length of SSID * @tailroom: tailroom to reserve at end of SKB for IEs * * Creates a Probe Request template which can, for example, be uploaded to * hardware. * * Return: The Probe Request template. %NULL on error. */ struct sk_buff *ieee80211_probereq_get(struct ieee80211_hw *hw, const u8 *src_addr, const u8 *ssid, size_t ssid_len, size_t tailroom);
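/*
 * Example (illustrative sketch): building a keepalive template for
 * firmware offload using ieee80211_nullfunc_get() above.
 * foo_fw_set_keepalive() is an invented firmware call.
 */
static int foo_setup_keepalive(struct ieee80211_hw *hw,
			       struct ieee80211_vif *vif)
{
	struct sk_buff *skb;

	/* prefer a QoS NDP; link_id < 0 gives MLD addresses on an MLD vif */
	skb = ieee80211_nullfunc_get(hw, vif, -1, true);
	if (!skb)
		return -ENOMEM;

	foo_fw_set_keepalive(skb->data, skb->len);
	dev_kfree_skb(skb);
	return 0;
}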
/** * ieee80211_rts_get - RTS frame generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @frame: pointer to the frame that is going to be protected by the RTS. * @frame_len: the frame length (in octets). * @frame_txctl: &struct ieee80211_tx_info of the frame. * @rts: The buffer where to store the RTS frame. * * If the RTS frames are generated by the host system (i.e., not in * hardware/firmware), the low-level driver uses this function to receive * the next RTS frame from the 802.11 code. The low-level driver is * responsible for calling this function before an RTS frame is needed. */ void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_rts *rts); /** * ieee80211_rts_duration - Get the duration field for an RTS frame * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @frame_len: the length of the frame that is going to be protected by the RTS. * @frame_txctl: &struct ieee80211_tx_info of the frame. * * If the RTS is generated in firmware, but the host system must provide * the duration field, the low-level driver uses this function to receive * the duration field value in little-endian byteorder. * * Return: The duration. */ __le16 ieee80211_rts_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl); /** * ieee80211_ctstoself_get - CTS-to-self frame generation function * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @frame: pointer to the frame that is going to be protected by the CTS-to-self. * @frame_len: the frame length (in octets). * @frame_txctl: &struct ieee80211_tx_info of the frame. * @cts: The buffer where to store the CTS-to-self frame. * * If the CTS-to-self frames are generated by the host system (i.e., not in * hardware/firmware), the low-level driver uses this function to receive * the next CTS-to-self frame from the 802.11 code. The low-level driver is * responsible for calling this function before a CTS-to-self frame is needed. */ void ieee80211_ctstoself_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_info *frame_txctl, struct ieee80211_cts *cts); /** * ieee80211_ctstoself_duration - Get the duration field for a CTS-to-self frame * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @frame_len: the length of the frame that is going to be protected by the CTS-to-self. * @frame_txctl: &struct ieee80211_tx_info of the frame. * * If the CTS-to-self is generated in firmware, but the host system must provide * the duration field, the low-level driver uses this function to receive * the duration field value in little-endian byteorder. * * Return: The duration. */ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_info *frame_txctl); /** * ieee80211_generic_frame_duration - Calculate the duration field for a frame * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @band: the band to calculate the frame duration on * @frame_len: the length of the frame. * @rate: the rate at which the frame is going to be transmitted. * * Calculate the duration field of some generic frame, given its * length and transmission rate (in 100kbps). * * Return: The duration.
*/ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum nl80211_band band, size_t frame_len, struct ieee80211_rate *rate); /** * ieee80211_get_buffered_bc - accessing buffered broadcast and multicast frames * @hw: pointer as obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * Function for accessing buffered broadcast and multicast frames. If * hardware/firmware does not implement buffering of broadcast/multicast * frames when power saving is used, 802.11 code buffers them in the host * memory. The low-level driver uses this function to fetch the next * buffered frame. In most cases, this is used when generating a beacon * frame. * * Return: A pointer to the next buffered skb or NULL if no more buffered * frames are available. * * Note: buffered frames are returned only after a DTIM beacon frame was * generated with ieee80211_beacon_get() and the low-level driver must thus * call ieee80211_beacon_get() first. ieee80211_get_buffered_bc() returns * NULL if the previously generated beacon was not a DTIM beacon, so the * low-level driver does not need to check for DTIM beacons separately and * should be able to use common code for all beacons. */ struct sk_buff * ieee80211_get_buffered_bc(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_get_tkip_p1k_iv - get a TKIP phase 1 key for IV32 * * This function returns the TKIP phase 1 key for the given IV32. * * @keyconf: the parameter passed with the set key * @iv32: IV32 to get the P1K for * @p1k: a buffer to which the key will be written, as 5 u16 values */ void ieee80211_get_tkip_p1k_iv(struct ieee80211_key_conf *keyconf, u32 iv32, u16 *p1k); /** * ieee80211_get_tkip_p1k - get a TKIP phase 1 key * * This function returns the TKIP phase 1 key for the IV32 taken * from the given packet. * * @keyconf: the parameter passed with the set key * @skb: the packet to take the IV32 value from that will be encrypted * with this P1K * @p1k: a buffer to which the key will be written, as 5 u16 values */ static inline void ieee80211_get_tkip_p1k(struct ieee80211_key_conf *keyconf, struct sk_buff *skb, u16 *p1k) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; const u8 *data = (u8 *)hdr + ieee80211_hdrlen(hdr->frame_control); u32 iv32 = get_unaligned_le32(&data[4]); ieee80211_get_tkip_p1k_iv(keyconf, iv32, p1k); } /** * ieee80211_get_tkip_rx_p1k - get a TKIP phase 1 key for RX * * This function returns the TKIP phase 1 key for the given IV32 * and transmitter address. * * @keyconf: the parameter passed with the set key * @ta: TA that will be used with the key * @iv32: IV32 to get the P1K for * @p1k: a buffer to which the key will be written, as 5 u16 values */ void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf, const u8 *ta, u32 iv32, u16 *p1k); /** * ieee80211_get_tkip_p2k - get a TKIP phase 2 key * * This function computes the TKIP RC4 key for the IV values * in the packet. * * @keyconf: the parameter passed with the set key * @skb: the packet to take the IV32/IV16 values from that will be * encrypted with this key * @p2k: a buffer to which the key will be written, 16 bytes */ void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf, struct sk_buff *skb, u8 *p2k);
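/*
 * Example (illustrative sketch): flushing buffered broadcast/multicast
 * frames after fetching a beacon, as described for
 * ieee80211_get_buffered_bc() above. foo_tx() is an invented TX hook.
 */
static void foo_beacon_work(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
{
	struct sk_buff *skb, *bcn;

	bcn = ieee80211_beacon_get(hw, vif, 0);
	if (!bcn)
		return;
	foo_tx(bcn);

	/* returns frames only if the beacon above was a DTIM beacon */
	while ((skb = ieee80211_get_buffered_bc(hw, vif)) != NULL)
		foo_tx(skb);
}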
/** * ieee80211_tkip_add_iv - write TKIP IV and Ext. IV to pos * * @pos: start of crypto header * @keyconf: the parameter passed with the set key * @pn: PN to add * * Returns: pointer to the octet following IVs (i.e. beginning of * the packet payload) * * This function writes the TKIP IV value to pos (which should * point to the crypto header) */ u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn); /** * ieee80211_get_key_rx_seq - get key RX sequence counter * * @keyconf: the parameter passed with the set key * @tid: The TID, or -1 for the management frame value (CCMP/GCMP only); * the value on TID 0 is also used for non-QoS frames. For * CMAC, only TID 0 is valid. * @seq: buffer to receive the sequence data * * This function allows a driver to retrieve the current RX IV/PNs * for the given key. It must not be called if IV checking is done * by the device and not by mac80211. * * Note that this function may only be called when no RX processing * can be done concurrently. */ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); /** * ieee80211_set_key_rx_seq - set key RX sequence counter * * @keyconf: the parameter passed with the set key * @tid: The TID, or -1 for the management frame value (CCMP/GCMP only); * the value on TID 0 is also used for non-QoS frames. For * CMAC, only TID 0 is valid. * @seq: new sequence data * * This function allows a driver to set the current RX IV/PNs for the * given key. This is useful when resuming from WoWLAN sleep and GTK * rekey may have been done while suspended. It should not be called * if IV checking is done by the device and not by mac80211. * * Note that this function may only be called when no RX processing * can be done concurrently. */ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); /** * ieee80211_remove_key - remove the given key * @keyconf: the parameter passed with the set key * * Context: Must be called with the wiphy mutex held. * * Remove the given key. If the key was uploaded to the hardware at the * time this function is called, it is not deleted in the hardware but * instead assumed to have been removed already. */ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); /** * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN * @vif: the virtual interface to add the key on * @keyconf: new key data * @link_id: the link id of the key or -1 for non-MLO * * When GTK rekeying was done while the system was suspended, (a) new * key(s) will be available. These will be needed by mac80211 for proper * RX processing, so this function allows setting them. * * Return: the newly allocated key structure, which will have * similar contents to the passed key configuration but point to * mac80211-owned memory. In case of errors, the function returns an * ERR_PTR(), use IS_ERR() etc. * * Note that this function assumes the key isn't added to hardware * acceleration, so no TX will be done with the key. Since it's a GTK * on managed (station) networks, this is true anyway. If the driver * calls this function from the resume callback and subsequently uses * the return code 1 to reconfigure the device, this key will be part * of the reconfiguration. * * Note that the driver should also call ieee80211_set_key_rx_seq() * for the new key for each TID to set up sequence counters properly. * * IMPORTANT: If this replaces a key that is present in the hardware, * then it will attempt to remove it during this call. In many cases * this isn't what you want, so call ieee80211_remove_key() first for * the key that's being replaced.
*/ struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, struct ieee80211_key_conf *keyconf, int link_id); /** * ieee80211_gtk_rekey_notify - notify userspace supplicant of rekeying * @vif: virtual interface the rekeying was done on * @bssid: The BSSID of the AP, for checking association * @replay_ctr: the new replay counter after GTK rekeying * @gfp: allocation flags */ void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid, const u8 *replay_ctr, gfp_t gfp); /** * ieee80211_key_mic_failure - increment MIC failure counter for the key * * Note: this is really only safe if no other RX function is called * at the same time. * * @keyconf: the key in question */ void ieee80211_key_mic_failure(struct ieee80211_key_conf *keyconf); /** * ieee80211_key_replay - increment replay counter for the key * * Note: this is really only safe if no other RX function is called * at the same time. * * @keyconf: the key in question */ void ieee80211_key_replay(struct ieee80211_key_conf *keyconf); /** * ieee80211_wake_queue - wake specific queue * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * * Drivers must use this function instead of netif_wake_queue. */ void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue); /** * ieee80211_stop_queue - stop specific queue * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * * Drivers must use this function instead of netif_stop_queue. */ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue); /** * ieee80211_queue_stopped - test status of the queue * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * * Drivers must use this function instead of netif_queue_stopped. * * Return: %true if the queue is stopped. %false otherwise. */ int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue); /** * ieee80211_stop_queues - stop all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * * Drivers must use this function instead of netif_tx_stop_all_queues. */ void ieee80211_stop_queues(struct ieee80211_hw *hw); /** * ieee80211_wake_queues - wake all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * * Drivers must use this function instead of netif_tx_wake_all_queues. */ void ieee80211_wake_queues(struct ieee80211_hw *hw); /** * ieee80211_scan_completed - completed hardware scan * * When hardware scan offload is used (i.e. the hw_scan() callback is * assigned) this function needs to be called by the driver to notify * mac80211 that the scan finished. This function can be called from * any context, including hardirq context. * * @hw: the hardware that finished the scan * @info: information about the completed scan */ void ieee80211_scan_completed(struct ieee80211_hw *hw, struct cfg80211_scan_info *info); /** * ieee80211_sched_scan_results - got results from scheduled scan * * When a scheduled scan is running, this function needs to be called by the * driver whenever there are new scan results available. * * @hw: the hardware that is performing scheduled scans */ void ieee80211_sched_scan_results(struct ieee80211_hw *hw); /** * ieee80211_sched_scan_stopped - inform that the scheduled scan has stopped * * When a scheduled scan is running, this function can be called by * the driver if it needs to stop the scan to perform another task. * Usual scenarios are drivers that cannot continue the scheduled scan * while associating, for instance. 
 * * @hw: the hardware that is performing scheduled scans */ void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw); /** * enum ieee80211_interface_iteration_flags - interface iteration flags * @IEEE80211_IFACE_ITER_NORMAL: Iterate over all interfaces that have * been added to the driver; however, note that during hardware * reconfiguration (after restart_hw) it will iterate over a new * interface and over all the existing interfaces even if they * haven't been re-added to the driver yet. * @IEEE80211_IFACE_ITER_RESUME_ALL: During resume, iterate over all * interfaces, even if they haven't been re-added to the driver yet. * @IEEE80211_IFACE_ITER_ACTIVE: Iterate only active interfaces (netdev is up). * @IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER: Skip any interfaces where SDATA * is not in the driver. This may fix crashes during firmware recovery * for instance. */ enum ieee80211_interface_iteration_flags { IEEE80211_IFACE_ITER_NORMAL = 0, IEEE80211_IFACE_ITER_RESUME_ALL = BIT(0), IEEE80211_IFACE_ITER_ACTIVE = BIT(1), IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER = BIT(2), }; /** * ieee80211_iterate_interfaces - iterate interfaces * * This function iterates over the interfaces associated with a given * hardware and calls the callback for them. This includes active as well as * inactive interfaces. This function allows the iterator function to sleep. * Will iterate over a new interface during add_interface(). * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call * @data: first argument of the iterator function */ void ieee80211_iterate_interfaces(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data); /** * ieee80211_iterate_active_interfaces - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. * This function allows the iterator function to sleep; when the iterator * function must be atomic, @ieee80211_iterate_active_interfaces_atomic * can be used. * Does not iterate over a new interface during add_interface(). * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call * @data: first argument of the iterator function */ static inline void ieee80211_iterate_active_interfaces(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data) { ieee80211_iterate_interfaces(hw, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, iterator, data); } /** * ieee80211_iterate_active_interfaces_atomic - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. * This function requires the iterator callback function to be atomic; * if that is not desired, use @ieee80211_iterate_active_interfaces instead. * Does not iterate over a new interface during add_interface().
 * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data); /** * ieee80211_iterate_active_interfaces_mtx - iterate active interfaces * * This function iterates over the interfaces associated with a given * hardware that are currently active and calls the callback for them. * This version can only be used while holding the wiphy mutex. * * @hw: the hardware struct of which the interfaces should be iterated over * @iter_flags: iteration flags, see &enum ieee80211_interface_iteration_flags * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ void ieee80211_iterate_active_interfaces_mtx(struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), void *data); /** * ieee80211_iterate_stations_atomic - iterate stations * * This function iterates over all stations associated with a given * hardware that are currently uploaded to the driver and calls the callback * function for them. * This function requires the iterator callback function to be atomic. * * @hw: the hardware struct of which the stations should be iterated over * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data); /** * ieee80211_iterate_stations_mtx - iterate stations * * This function iterates over all stations associated with a given * hardware that are currently uploaded to the driver and calls the callback * function for them. This version can only be used while holding the wiphy * mutex. * * @hw: the hardware struct of which the stations should be iterated over * @iterator: the iterator function to call * @data: first argument of the iterator function */ void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, void (*iterator)(void *data, struct ieee80211_sta *sta), void *data); /** * ieee80211_queue_work - add work onto the mac80211 workqueue * * Drivers and mac80211 use this to add work onto the mac80211 workqueue. * This helper ensures drivers are not queueing work when they should not be. * * @hw: the hardware struct for the interface we are adding work for * @work: the work we want to add onto the mac80211 workqueue */ void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work); /** * ieee80211_queue_delayed_work - add work onto the mac80211 workqueue * * Drivers and mac80211 use this to queue delayed work onto the mac80211 * workqueue. * * @hw: the hardware struct for the interface we are adding work for * @dwork: delayable work to queue onto the mac80211 workqueue * @delay: number of jiffies to wait before queueing */ void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, struct delayed_work *dwork, unsigned long delay);
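/*
 * Example (illustrative sketch): counting active interfaces from atomic
 * context with the iterator API above. The callback cannot sleep.
 */
static void foo_count_iter(void *data, u8 *mac, struct ieee80211_vif *vif)
{
	int *count = data;

	(*count)++;
}

static int foo_count_active_vifs(struct ieee80211_hw *hw)
{
	int count = 0;

	ieee80211_iterate_active_interfaces_atomic(hw,
						   IEEE80211_IFACE_ITER_NORMAL,
						   foo_count_iter, &count);
	return count;
}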
/** * ieee80211_refresh_tx_agg_session_timer - Refresh a tx agg session timer. * @sta: the station for which to start a BA session * @tid: the TID to BA on. * * This function allows the low level driver to refresh the tx agg session * timer to maintain the BA session; the session level will still be managed * by mac80211. * * Note: must be called in an RCU critical section. */ void ieee80211_refresh_tx_agg_session_timer(struct ieee80211_sta *sta, u16 tid); /** * ieee80211_start_tx_ba_session - Start a tx Block Ack session. * @sta: the station for which to start a BA session * @tid: the TID to BA on. * @timeout: session timeout value (in TUs) * * Return: success if addBA request was sent, failure otherwise * * Although mac80211/low level driver/user space application can estimate * the need to start aggregation on a certain RA/TID, the session level * will be managed by mac80211. */ int ieee80211_start_tx_ba_session(struct ieee80211_sta *sta, u16 tid, u16 timeout); /** * ieee80211_start_tx_ba_cb_irqsafe - low level driver ready to aggregate. * @vif: &struct ieee80211_vif pointer from the add_interface callback * @ra: receiver address of the BA session recipient. * @tid: the TID to BA on. * * This function must be called by the low level driver once it has * finished with preparations for the BA session. It can be called * from any context. */ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra, u16 tid); /** * ieee80211_stop_tx_ba_session - Stop a Block Ack session. * @sta: the station whose BA session to stop * @tid: the TID to stop BA. * * Return: negative error if the TID is invalid, or no aggregation active * * Although mac80211/low level driver/user space application can estimate * the need to stop aggregation on a certain RA/TID, the session level * will be managed by mac80211. */ int ieee80211_stop_tx_ba_session(struct ieee80211_sta *sta, u16 tid); /** * ieee80211_stop_tx_ba_cb_irqsafe - low level driver ready to stop aggregate. * @vif: &struct ieee80211_vif pointer from the add_interface callback * @ra: receiver address of the BA session recipient. * @tid: the desired TID to BA on. * * This function must be called by the low level driver once it has * finished with preparations for the BA session tear down. It * can be called from any context. */ void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, const u8 *ra, u16 tid); /** * ieee80211_find_sta - find a station * * @vif: virtual interface to look for station on * @addr: station's address * * Return: The station, if found. %NULL otherwise. * * Note: This function must be called under RCU lock and the * resulting pointer is only valid under RCU lock as well. */ struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, const u8 *addr); /** * ieee80211_find_sta_by_ifaddr - find a station on hardware * * @hw: pointer as obtained from ieee80211_alloc_hw() * @addr: remote station's address * @localaddr: local address (vif->sdata->vif.addr). Use NULL for 'any'. * * Return: The station, if found. %NULL otherwise. * * Note: This function must be called under RCU lock and the * resulting pointer is only valid under RCU lock as well. * * NOTE: You may pass NULL for localaddr, but then you will just get * the first STA that matches the remote address 'addr'. * We can have multiple STA associated with multiple * logical stations (e.g. consider a station connecting to another * BSSID on the same AP hardware without disconnecting first). * In this case, the result of this method with localaddr NULL * is not reliable. * * DO NOT USE THIS FUNCTION with localaddr NULL if at all possible.
*/ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr); /** * ieee80211_find_sta_by_link_addrs - find STA by link addresses * @hw: pointer as obtained from ieee80211_alloc_hw() * @addr: remote station's link address * @localaddr: local link address, use %NULL for any (but avoid that) * @link_id: pointer to obtain the link ID if the STA is found, * may be %NULL if the link ID is not needed * * Obtain the STA by link address, must use RCU protection. * * Return: pointer to STA if found, otherwise %NULL. */ struct ieee80211_sta * ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr, unsigned int *link_id); /** * ieee80211_sta_block_awake - block station from waking up * @hw: the hardware * @pubsta: the station * @block: whether to block or unblock * * Some devices require that all frames that are on the queues * for a specific station that went to sleep are flushed before * a poll response or frames after the station woke up can be * delivered to it. Note that such frames must be rejected * by the driver as filtered, with the appropriate status flag. * * This function allows implementing this mode in a race-free * manner. * * To do this, a driver must keep track of the number of frames * still enqueued for a specific station. If this number is not * zero when the station goes to sleep, the driver must call * this function to force mac80211 to consider the station to * be asleep regardless of the station's actual state. Once the * number of outstanding frames reaches zero, the driver must * call this function again to unblock the station. That will * cause mac80211 to be able to send ps-poll responses, and if * the station polled in the meantime, frames will also * be sent out as a result of this. Additionally, the driver * will be notified that the station woke up some time after * it is unblocked, regardless of whether the station actually * woke up while blocked or not. */ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, bool block); /** * ieee80211_sta_eosp - notify mac80211 about end of SP * @pubsta: the station * * When a device transmits frames in a way that it can't tell * mac80211 in the TX status about the EOSP, it must clear the * %IEEE80211_TX_STATUS_EOSP bit and call this function instead. * This applies for PS-Poll as well as uAPSD. * * Note that just like with _tx_status() and _rx() drivers must * not mix calls to irqsafe/non-irqsafe versions, this function * must not be mixed with those either. Use all irqsafe, or * all non-irqsafe, don't mix! * * NB: the _irqsafe version of this function doesn't exist, no * driver needs it right now. Don't call this function if * you'd need the _irqsafe version, look at the git history * and restore the _irqsafe version! */ void ieee80211_sta_eosp(struct ieee80211_sta *pubsta);
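/*
 * Example (illustrative sketch): looking up a station with
 * ieee80211_find_sta() described above; the returned pointer is only
 * valid inside the RCU read-side critical section.
 */
static void foo_log_sta(struct ieee80211_vif *vif, const u8 *addr)
{
	struct ieee80211_sta *sta;

	rcu_read_lock();
	sta = ieee80211_find_sta(vif, addr);
	if (sta)
		pr_debug("found sta %pM\n", sta->addr);
	rcu_read_unlock();
}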
/** * ieee80211_send_eosp_nullfunc - ask mac80211 to send NDP with EOSP * @pubsta: the station * @tid: the tid of the NDP * * Sometimes the device understands that it needs to close * the Service Period unexpectedly. This can happen when * sending frames that are filling holes in the BA window. * In this case, the device can ask mac80211 to send a * Nullfunc frame with EOSP set. When that happens, the * driver must have called ieee80211_sta_set_buffered() to * let mac80211 know that there are no buffered frames any * more, otherwise mac80211 will get the more_data bit wrong. * The low level driver must have made sure that the frame * will be sent despite the station being in power-save. * Mac80211 won't call allow_buffered_frames(). * Note that calling this function doesn't exempt the driver * from closing the EOSP properly; it will still have to call * ieee80211_sta_eosp() when the NDP is sent. */ void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid); /** * ieee80211_sta_recalc_aggregates - recalculate aggregate data after a change * @pubsta: the station * * Call this function after changing per-link aggregate data as referenced in * &struct ieee80211_sta_aggregates by accessing the agg field of * &struct ieee80211_link_sta. * * With non-MLO, the data in deflink will be referenced directly. In that case * there is no need to call this function. */ void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta); /** * ieee80211_sta_register_airtime - register airtime usage for a sta/tid * * Register airtime usage for a given sta on a given tid. The driver must call * this function to notify mac80211 that a station used a certain amount of * airtime. This information will be used by the TXQ scheduler to schedule * stations in a way that ensures airtime fairness. * * The reported airtime should, as a minimum, include all time that is spent * transmitting to the remote station, including overhead and padding, but not * including time spent waiting for a TXOP. If the time is not reported by the * hardware it can in some cases be calculated from the rate and known frame * composition. When possible, the time should include any failed transmission * attempts. * * The driver can either call this function synchronously for every packet or * aggregate, or asynchronously as airtime usage information becomes available. * TX and RX airtime can be reported together, or separately by setting one of * them to 0. * * @pubsta: the station * @tid: the TID to register airtime for * @tx_airtime: airtime used during TX (in usec) * @rx_airtime: airtime used during RX (in usec) */ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, u32 tx_airtime, u32 rx_airtime); /** * ieee80211_txq_airtime_check - check if a txq can send frame to device * * @hw: pointer obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface * * Return: %true if the AQL's airtime limit has not been reached and the txq can * continue to send more packets to the device. Otherwise return %false. */ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); /** * ieee80211_iter_keys - iterate keys programmed into the device * @hw: pointer obtained from ieee80211_alloc_hw() * @vif: virtual interface to iterate, may be %NULL for all * @iter: iterator function that will be called for each key * @iter_data: custom data to pass to the iterator function * * Context: Must be called with wiphy mutex held; can sleep. * * This function can be used to iterate all the keys known to * mac80211, even those that weren't previously programmed into * the device. This is intended for use in WoWLAN if the device * needs reprogramming of the keys during suspend. * * The order in which the keys are iterated matches the order * in which they were originally installed and handed to the * set_key callback.
 */
void ieee80211_iter_keys(struct ieee80211_hw *hw,
			 struct ieee80211_vif *vif,
			 void (*iter)(struct ieee80211_hw *hw,
				      struct ieee80211_vif *vif,
				      struct ieee80211_sta *sta,
				      struct ieee80211_key_conf *key,
				      void *data),
			 void *iter_data);

/**
 * ieee80211_iter_keys_rcu - iterate keys programmed into the device
 * @hw: pointer obtained from ieee80211_alloc_hw()
 * @vif: virtual interface to iterate, may be %NULL for all
 * @iter: iterator function that will be called for each key
 * @iter_data: custom data to pass to the iterator function
 *
 * This function can be used to iterate all the keys known to
 * mac80211, even those that weren't previously programmed into
 * the device. Note that due to locking reasons, keys of a station
 * that is in the process of being removed will be skipped.
 *
 * This function requires being called in an RCU critical section,
 * and thus @iter must be atomic.
 */
void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw,
			     struct ieee80211_vif *vif,
			     void (*iter)(struct ieee80211_hw *hw,
					  struct ieee80211_vif *vif,
					  struct ieee80211_sta *sta,
					  struct ieee80211_key_conf *key,
					  void *data),
			     void *iter_data);

/**
 * ieee80211_iter_chan_contexts_atomic - iterate channel contexts
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @iter: iterator function
 * @iter_data: data passed to iterator function
 *
 * Iterate all active channel contexts. This function is atomic and
 * doesn't acquire any locks internally that might be held in other
 * places while calling into the driver.
 *
 * The iterator will not find a context that's being added (during
 * the driver callback to add it) but will find it while it's being
 * removed.
 *
 * Note that during hardware restart, all contexts that existed
 * before the restart are considered already present so will be
 * found while iterating, whether they've been re-added already
 * or not.
 */
void ieee80211_iter_chan_contexts_atomic(
	struct ieee80211_hw *hw,
	void (*iter)(struct ieee80211_hw *hw,
		     struct ieee80211_chanctx_conf *chanctx_conf,
		     void *data),
	void *iter_data);

/**
 * ieee80211_ap_probereq_get - retrieve a Probe Request template
 * @hw: pointer obtained from ieee80211_alloc_hw().
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Creates a Probe Request template which can, for example, be uploaded to
 * hardware. The template is filled with BSSID, SSID and supported rate
 * information. This function must only be called from within the
 * .bss_info_changed callback function and only in managed mode. The function
 * is only useful when the interface is associated, otherwise it will return
 * %NULL.
 *
 * Return: The Probe Request template. %NULL on error.
 */
struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
					  struct ieee80211_vif *vif);

/**
 * ieee80211_beacon_loss - inform that the hardware does not receive beacons
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * When beacon filtering is enabled with %IEEE80211_VIF_BEACON_FILTER and
 * %IEEE80211_CONF_PS is set, the driver needs to use this function to inform
 * mac80211 whenever the hardware is not receiving beacons.
 */
void ieee80211_beacon_loss(struct ieee80211_vif *vif);

/**
 * ieee80211_connection_loss - inform that the hardware has lost the
 *	connection to the AP
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * When beacon filtering is enabled with %IEEE80211_VIF_BEACON_FILTER, and
 * %IEEE80211_CONF_PS and %IEEE80211_HW_CONNECTION_MONITOR are set, the driver
 * needs to use this function to inform mac80211 that the connection to the
 * AP has been lost.
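 *
 * For instance (hypothetical driver code), a firmware-based connection
 * monitor might report:
 *
 *	if (fw_event->id == DRV_FW_EVENT_CONNECTION_LOST)
 *		ieee80211_connection_loss(vif);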
 *
 * The function may also be called if the connection needs to be terminated
 * for some other reason, even if %IEEE80211_HW_CONNECTION_MONITOR isn't set.
 *
 * This function will cause immediate change to disassociated state,
 * without connection recovery attempts.
 */
void ieee80211_connection_loss(struct ieee80211_vif *vif);

/**
 * ieee80211_disconnect - request disconnection
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @reconnect: immediate reconnect is desired
 *
 * Request disconnection from the current network and, if enabled, send a
 * hint to the higher layers that immediate reconnect is desired.
 */
void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect);

/**
 * ieee80211_resume_disconnect - disconnect from AP after resume
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Instructs mac80211 to disconnect from the AP after resume.
 * Drivers can use this after WoWLAN if they know that the
 * connection cannot be kept up, for example because keys were
 * used while the device was asleep but the replay counters or
 * similar cannot be retrieved from the device during resume.
 *
 * Note that due to implementation issues, if the driver uses
 * the reconfiguration functionality during resume the interface
 * will still be added as associated first during resume and then
 * disconnect normally later.
 *
 * This function can only be called from the resume callback and
 * the driver must not be holding any of its own locks while it
 * calls this function, or at least not any locks it needs in the
 * key configuration paths (if it supports HW crypto).
 */
void ieee80211_resume_disconnect(struct ieee80211_vif *vif);

/**
 * ieee80211_hw_restart_disconnect - disconnect from AP after
 * hardware restart
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Instructs mac80211 to disconnect from the AP after
 * hardware restart.
 */
void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif);

/**
 * ieee80211_cqm_rssi_notify - inform that a configured connection quality
 *	monitoring RSSI threshold was triggered
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @rssi_event: the RSSI trigger event type
 * @rssi_level: new RSSI level value or 0 if not available
 * @gfp: context flags
 *
 * When %IEEE80211_VIF_SUPPORTS_CQM_RSSI is set, and connection quality
 * monitoring is configured with an RSSI threshold, the driver will use this
 * function to inform mac80211 whenever the RSSI level reaches the threshold.
 */
void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
			       enum nl80211_cqm_rssi_threshold_event rssi_event,
			       s32 rssi_level,
			       gfp_t gfp);

/**
 * ieee80211_cqm_beacon_loss_notify - inform CQM of beacon loss
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @gfp: context flags
 */
void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp);

/**
 * ieee80211_radar_detected - inform that a radar was detected
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @chanctx_conf: channel context on which the radar was detected. Mandatory
 *	to pass a valid pointer during MLO; for non-MLO, %NULL can be passed
 */
void ieee80211_radar_detected(struct ieee80211_hw *hw,
			      struct ieee80211_chanctx_conf *chanctx_conf);

/**
 * ieee80211_chswitch_done - Complete channel switch process
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @success: whether the channel switch was successful
 * @link_id: the link_id on which the switch was done. Ignored if success is
 *	false.
 *
 * Complete the channel switch post-process: set the new operational channel
 * and wake up the suspended queues.
 */
void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success,
			     unsigned int link_id);

/**
 * ieee80211_channel_switch_disconnect - disconnect due to channel switch error
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 *
 * Instruct mac80211 to disconnect due to a channel switch error. The channel
 * switch can request to block TX, so we need to make sure we do not send a
 * deauth frame in this case.
 */
void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif);

/**
 * ieee80211_request_smps - request SM PS transition
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @link_id: link ID for MLO, or 0
 * @smps_mode: new SM PS mode
 *
 * This allows the driver to request an SM PS transition in managed
 * mode. This is useful when the driver has more information than
 * the stack about possible interference, for example by bluetooth.
 */
void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id,
			    enum ieee80211_smps_mode smps_mode);

/**
 * ieee80211_ready_on_channel - notification of remain-on-channel start
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 */
void ieee80211_ready_on_channel(struct ieee80211_hw *hw);

/**
 * ieee80211_remain_on_channel_expired - remain_on_channel duration expired
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 */
void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw);

/**
 * ieee80211_stop_rx_ba_session - callback to stop existing BA sessions
 *
 * In order not to harm system performance and user experience, the device
 * may request not to allow any RX BA session and to tear down existing RX BA
 * sessions, based on system constraints such as periodic BT activity that
 * needs to limit WLAN activity (e.g. SCO or A2DP). In such cases, the
 * intention is to limit the duration of the RX PPDU and therefore prevent
 * the peer device from using A-MPDU aggregation.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @ba_rx_bitmap: bitmap of open RX BA sessions, per TID
 * @addr: pointer to the BSSID MAC address
 */
void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap,
				  const u8 *addr);

/**
 * ieee80211_mark_rx_ba_filtered_frames - move RX BA window and mark filtered
 * @pubsta: station struct
 * @tid: the session's TID
 * @ssn: starting sequence number of the bitmap, all frames before this are
 *	assumed to be out of the window after the call
 * @filtered: bitmap of filtered frames, BIT(0) is the @ssn entry etc.
 * @received_mpdus: number of received mpdus in firmware
 *
 * This function moves the BA window and releases all frames before @ssn, and
 * marks frames marked in the bitmap as having been filtered. Afterwards, it
 * checks if any frames in the window starting from @ssn can now be released
 * (in case they were only waiting for frames that were filtered.)
 * (This only works correctly if @max_rx_aggregation_subframes <= 64 frames.)
 */
void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
					  u16 ssn, u64 filtered,
					  u16 received_mpdus);

/**
 * ieee80211_send_bar - send a BlockAckReq frame
 *
 * Can be used to flush pending frames from the peer's aggregation reorder
 * buffer.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback.
 * @ra: the peer's destination address
 * @tid: the TID of the aggregation session
 * @ssn: the new starting sequence number for the receiver
 */
void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn);

/**
 * ieee80211_manage_rx_ba_offl - helper to queue an RX BA work
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, const u8 *addr,
				 unsigned int tid);

/**
 * ieee80211_start_rx_ba_session_offl - start a Rx BA session
 *
 * Some device drivers may offload part of the Rx aggregation flow including
 * AddBa/DelBa negotiation but may otherwise be incapable of full Rx
 * reordering.
 *
 * Create structures responsible for reordering so device drivers may call
 * here when they complete AddBa negotiation.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
static inline void ieee80211_start_rx_ba_session_offl(struct ieee80211_vif *vif,
						      const u8 *addr, u16 tid)
{
	if (WARN_ON(tid >= IEEE80211_NUM_TIDS))
		return;
	ieee80211_manage_rx_ba_offl(vif, addr, tid);
}

/**
 * ieee80211_stop_rx_ba_session_offl - stop a Rx BA session
 *
 * Some device drivers may offload part of the Rx aggregation flow including
 * AddBa/DelBa negotiation but may otherwise be incapable of full Rx
 * reordering.
 *
 * Destroy structures responsible for reordering so device drivers may call
 * here when they complete DelBa negotiation.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
static inline void ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif,
						     const u8 *addr, u16 tid)
{
	if (WARN_ON(tid >= IEEE80211_NUM_TIDS))
		return;
	ieee80211_manage_rx_ba_offl(vif, addr, tid + IEEE80211_NUM_TIDS);
}

/**
 * ieee80211_rx_ba_timer_expired - stop a Rx BA session due to timeout
 *
 * Some device drivers do not offload AddBa/DelBa negotiation, but handle rx
 * buffer reordering internally, and therefore also handle the session timer.
 *
 * Trigger the timeout flow, which sends a DelBa.
 *
 * @vif: &struct ieee80211_vif pointer from the add_interface callback
 * @addr: station mac address
 * @tid: the rx tid
 */
void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif,
				   const u8 *addr, unsigned int tid);

/* Rate control API */

/**
 * struct ieee80211_tx_rate_control - rate control information for/from RC algo
 *
 * @hw: The hardware the algorithm is invoked for.
 * @sband: The band this frame is being transmitted on.
 * @bss_conf: the current BSS configuration
 * @skb: the skb that will be transmitted, the control information in it needs
 *	to be filled in
 * @reported_rate: The rate control algorithm can fill this in to indicate
 *	which rate should be reported to userspace as the current rate and
 *	used for rate calculations in the mesh network.
* @rts: whether RTS will be used for this frame because it is longer than the * RTS threshold * @short_preamble: whether mac80211 will request short-preamble transmission * if the selected rate supports it * @rate_idx_mask: user-requested (legacy) rate mask * @rate_idx_mcs_mask: user-requested MCS rate mask (NULL if not in use) * @bss: whether this frame is sent out in AP or IBSS mode */ struct ieee80211_tx_rate_control { struct ieee80211_hw *hw; struct ieee80211_supported_band *sband; struct ieee80211_bss_conf *bss_conf; struct sk_buff *skb; struct ieee80211_tx_rate reported_rate; bool rts, short_preamble; u32 rate_idx_mask; u8 *rate_idx_mcs_mask; bool bss; }; /** * enum rate_control_capabilities - rate control capabilities */ enum rate_control_capabilities { /** * @RATE_CTRL_CAPA_VHT_EXT_NSS_BW: * Support for extended NSS BW support (dot11VHTExtendedNSSCapable) * Note that this is only looked at if the minimum number of chains * that the AP uses is < the number of TX chains the hardware has, * otherwise the NSS difference doesn't bother us. */ RATE_CTRL_CAPA_VHT_EXT_NSS_BW = BIT(0), /** * @RATE_CTRL_CAPA_AMPDU_TRIGGER: * mac80211 should start A-MPDU sessions on tx */ RATE_CTRL_CAPA_AMPDU_TRIGGER = BIT(1), }; struct rate_control_ops { unsigned long capa; const char *name; void *(*alloc)(struct ieee80211_hw *hw); void (*add_debugfs)(struct ieee80211_hw *hw, void *priv, struct dentry *debugfsdir); void (*free)(void *priv); void *(*alloc_sta)(void *priv, struct ieee80211_sta *sta, gfp_t gfp); void (*rate_init)(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta); void (*rate_update)(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta, u32 changed); void (*free_sta)(void *priv, struct ieee80211_sta *sta, void *priv_sta); void (*tx_status_ext)(void *priv, struct ieee80211_supported_band *sband, void *priv_sta, struct ieee80211_tx_status *st); void (*tx_status)(void *priv, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta, void *priv_sta, struct sk_buff *skb); void (*get_rate)(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct ieee80211_tx_rate_control *txrc); void (*add_sta_debugfs)(void *priv, void *priv_sta, struct dentry *dir); u32 (*get_expected_throughput)(void *priv_sta); }; static inline int rate_supported(struct ieee80211_sta *sta, enum nl80211_band band, int index) { return (sta == NULL || sta->deflink.supp_rates[band] & BIT(index)); } static inline s8 rate_lowest_index(struct ieee80211_supported_band *sband, struct ieee80211_sta *sta) { int i; for (i = 0; i < sband->n_bitrates; i++) if (rate_supported(sta, sband->band, i)) return i; /* warn when we cannot find a rate. */ WARN_ON_ONCE(1); /* and return 0 (the lowest index) */ return 0; } static inline bool rate_usable_index_exists(struct ieee80211_supported_band *sband, struct ieee80211_sta *sta) { unsigned int i; for (i = 0; i < sband->n_bitrates; i++) if (rate_supported(sta, sband->band, i)) return true; return false; } /** * rate_control_set_rates - pass the sta rate selection to mac80211/driver * * When not doing a rate control probe to test rates, rate control should pass * its rate selection to mac80211. If the driver supports receiving a station * rate table, it will use it to ensure that frames are always sent based on * the most recent rate control module decision. 
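 *
 * A minimal sketch from a rate control module's perspective (hypothetical
 * code; chosen_idx is a placeholder for the module's rate decision):
 *
 *	struct ieee80211_sta_rates *rates =
 *		kzalloc(sizeof(*rates), GFP_ATOMIC);
 *
 *	if (!rates)
 *		return;
 *	rates->rate[0].idx = chosen_idx;
 *	rates->rate[0].count = 4;
 *	rate_control_set_rates(hw, pubsta, rates);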
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @pubsta: &struct ieee80211_sta pointer to the target destination.
 * @rates: new tx rate set to be used for this station.
 *
 * Return: 0 on success. An error code otherwise.
 */
int rate_control_set_rates(struct ieee80211_hw *hw,
			   struct ieee80211_sta *pubsta,
			   struct ieee80211_sta_rates *rates);

int ieee80211_rate_control_register(const struct rate_control_ops *ops);
void ieee80211_rate_control_unregister(const struct rate_control_ops *ops);

static inline bool
conf_is_ht20(struct ieee80211_conf *conf)
{
	return conf->chandef.width == NL80211_CHAN_WIDTH_20;
}

static inline bool
conf_is_ht40_minus(struct ieee80211_conf *conf)
{
	return conf->chandef.width == NL80211_CHAN_WIDTH_40 &&
	       conf->chandef.center_freq1 < conf->chandef.chan->center_freq;
}

static inline bool
conf_is_ht40_plus(struct ieee80211_conf *conf)
{
	return conf->chandef.width == NL80211_CHAN_WIDTH_40 &&
	       conf->chandef.center_freq1 > conf->chandef.chan->center_freq;
}

static inline bool
conf_is_ht40(struct ieee80211_conf *conf)
{
	return conf->chandef.width == NL80211_CHAN_WIDTH_40;
}

static inline bool
conf_is_ht(struct ieee80211_conf *conf)
{
	return (conf->chandef.width != NL80211_CHAN_WIDTH_5) &&
	       (conf->chandef.width != NL80211_CHAN_WIDTH_10) &&
	       (conf->chandef.width != NL80211_CHAN_WIDTH_20_NOHT);
}

static inline enum nl80211_iftype
ieee80211_iftype_p2p(enum nl80211_iftype type, bool p2p)
{
	if (p2p) {
		switch (type) {
		case NL80211_IFTYPE_STATION:
			return NL80211_IFTYPE_P2P_CLIENT;
		case NL80211_IFTYPE_AP:
			return NL80211_IFTYPE_P2P_GO;
		default:
			break;
		}
	}
	return type;
}

static inline enum nl80211_iftype
ieee80211_vif_type_p2p(struct ieee80211_vif *vif)
{
	return ieee80211_iftype_p2p(vif->type, vif->p2p);
}

/**
 * ieee80211_get_he_iftype_cap_vif - return HE capabilities for sband/vif
 * @sband: the sband to search for the iftype on
 * @vif: the vif to get the iftype from
 *
 * Return: pointer to the struct ieee80211_sta_he_cap, or %NULL if none found
 */
static inline const struct ieee80211_sta_he_cap *
ieee80211_get_he_iftype_cap_vif(const struct ieee80211_supported_band *sband,
				struct ieee80211_vif *vif)
{
	return ieee80211_get_he_iftype_cap(sband, ieee80211_vif_type_p2p(vif));
}

/**
 * ieee80211_get_he_6ghz_capa_vif - return HE 6 GHz capabilities
 * @sband: the sband to search for the STA on
 * @vif: the vif to get the iftype from
 *
 * Return: the 6 GHz capabilities
 */
static inline __le16
ieee80211_get_he_6ghz_capa_vif(const struct ieee80211_supported_band *sband,
			       struct ieee80211_vif *vif)
{
	return ieee80211_get_he_6ghz_capa(sband, ieee80211_vif_type_p2p(vif));
}

/**
 * ieee80211_get_eht_iftype_cap_vif - return EHT capabilities for sband/vif
 * @sband: the sband to search for the iftype on
 * @vif: the vif to get the iftype from
 *
 * Return: pointer to the struct ieee80211_sta_eht_cap, or %NULL if none found
 */
static inline const struct ieee80211_sta_eht_cap *
ieee80211_get_eht_iftype_cap_vif(const struct ieee80211_supported_band *sband,
				 struct ieee80211_vif *vif)
{
	return ieee80211_get_eht_iftype_cap(sband, ieee80211_vif_type_p2p(vif));
}

/**
 * ieee80211_update_mu_groups - set the VHT MU-MIMO group data
 *
 * @vif: the specified virtual interface
 * @link_id: the link ID for MLO, otherwise 0
 * @membership: 64 bits array - a bit is set if station is member of the group
 * @position: 2 bits per group id indicating the position in the group
 *
 * Note: This function assumes that the given vif is valid and the position
 * and membership data are of the correct size and are in the same byte
 * order as the
matching GroupId management frame. * Calls to this function need to be serialized with RX path. */ void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id, const u8 *membership, const u8 *position); void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif, int rssi_min_thold, int rssi_max_thold); void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif); /** * ieee80211_ave_rssi - report the average RSSI for the specified interface * * @vif: the specified virtual interface * * Note: This function assumes that the given vif is valid. * * Return: The average RSSI value for the requested interface, or 0 if not * applicable. */ int ieee80211_ave_rssi(struct ieee80211_vif *vif); /** * ieee80211_report_wowlan_wakeup - report WoWLAN wakeup * @vif: virtual interface * @wakeup: wakeup reason(s) * @gfp: allocation flags * * See cfg80211_report_wowlan_wakeup(). */ void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif, struct cfg80211_wowlan_wakeup *wakeup, gfp_t gfp); /** * ieee80211_tx_prepare_skb - prepare an 802.11 skb for transmission * @hw: pointer as obtained from ieee80211_alloc_hw() * @vif: virtual interface * @skb: frame to be sent from within the driver * @band: the band to transmit on * @sta: optional pointer to get the station to send the frame to * * Return: %true if the skb was prepared, %false otherwise * * Note: must be called under RCU lock */ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb, int band, struct ieee80211_sta **sta); /** * ieee80211_parse_tx_radiotap - Sanity-check and parse the radiotap header * of injected frames. * * To accurately parse and take into account rate and retransmission fields, * you must initialize the chandef field in the ieee80211_tx_info structure * of the skb before calling this function. * * @skb: packet injected by userspace * @dev: the &struct device of this 802.11 device * * Return: %true if the radiotap header was parsed, %false otherwise */ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, struct net_device *dev); /** * struct ieee80211_noa_data - holds temporary data for tracking P2P NoA state * * @next_tsf: TSF timestamp of the next absent state change * @has_next_tsf: next absent state change event pending * * @absent: descriptor bitmask, set if GO is currently absent * * private: * * @count: count fields from the NoA descriptors * @desc: adjusted data from the NoA */ struct ieee80211_noa_data { u32 next_tsf; bool has_next_tsf; u8 absent; u8 count[IEEE80211_P2P_NOA_DESC_MAX]; struct { u32 start; u32 duration; u32 interval; } desc[IEEE80211_P2P_NOA_DESC_MAX]; }; /** * ieee80211_parse_p2p_noa - initialize NoA tracking data from P2P IE * * @attr: P2P NoA IE * @data: NoA tracking data * @tsf: current TSF timestamp * * Return: number of successfully parsed descriptors */ int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr, struct ieee80211_noa_data *data, u32 tsf); /** * ieee80211_update_p2p_noa - get next pending P2P GO absent state change * * @data: NoA tracking data * @tsf: current TSF timestamp */ void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf); /** * ieee80211_tdls_oper_request - request userspace to perform a TDLS operation * @vif: virtual interface * @peer: the peer's destination address * @oper: the requested TDLS operation * @reason_code: reason code for the operation, valid for TDLS teardown * @gfp: allocation flags * * See cfg80211_tdls_oper_request(). 
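 *
 * For example (hypothetical driver code), a driver that decides a TDLS
 * link has become unusable might request a teardown:
 *
 *	ieee80211_tdls_oper_request(vif, peer_addr,
 *				    NL80211_TDLS_TEARDOWN,
 *				    WLAN_REASON_TDLS_TEARDOWN_UNREACHABLE,
 *				    GFP_KERNEL);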
 */
void ieee80211_tdls_oper_request(struct ieee80211_vif *vif, const u8 *peer,
				 enum nl80211_tdls_operation oper,
				 u16 reason_code, gfp_t gfp);

/**
 * ieee80211_reserve_tid - request to reserve a specific TID
 *
 * There is sometimes a need (such as in TDLS) for blocking the driver from
 * using a specific TID so that the FW can use it for certain operations such
 * as sending PTI requests. To make sure that the driver doesn't use that TID,
 * this function must be called as it flushes out packets on this TID and marks
 * it as blocked, so that any transmit for the station on this TID will be
 * redirected to the alternative TID in the same AC.
 *
 * Note that this function blocks and may call back into the driver, so it
 * should be called without driver locks held. Also note this function should
 * only be called from the driver's @sta_state callback.
 *
 * @sta: the station to reserve the TID for
 * @tid: the TID to reserve
 *
 * Returns: 0 on success; a negative error code on failure
 */
int ieee80211_reserve_tid(struct ieee80211_sta *sta, u8 tid);

/**
 * ieee80211_unreserve_tid - request to unreserve a specific TID
 *
 * Once there is no longer any need for reserving a certain TID, this function
 * should be called; afterwards, packets will no longer have their TID
 * modified to prevent the driver from using the reserved TID.
 *
 * Note that this function blocks and acquires a lock, so it should be called
 * without driver locks held. Also note this function should only be called
 * from the driver's @sta_state callback.
 *
 * @sta: the station
 * @tid: the TID to unreserve
 */
void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid);

/**
 * ieee80211_tx_dequeue - dequeue a packet from a software tx queue
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface, or from
 *	ieee80211_next_txq()
 *
 * Return: the skb if successful, %NULL if no frame was available.
 *
 * Note that this must be called in an rcu_read_lock() critical section,
 * which can only be released after the SKB was handled. Some pointers in
 * skb->cb, e.g. the key pointer, are protected by RCU and thus the
 * critical section must persist not just for the duration of this call
 * but for the duration of the frame handling.
 * However, also note that while in the wake_tx_queue() method,
 * rcu_read_lock() is already held.
 *
 * softirqs must also be disabled when this function is called.
 * In process context, use ieee80211_tx_dequeue_ni() instead.
 */
struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
				     struct ieee80211_txq *txq);

/**
 * ieee80211_tx_dequeue_ni - dequeue a packet from a software tx queue
 * (in process context)
 *
 * Like ieee80211_tx_dequeue() but can be called in process context
 * (internally disables bottom halves).
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface, or from
 *	ieee80211_next_txq()
 *
 * Return: the skb if successful, %NULL if no frame was available.
 */
static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw,
						      struct ieee80211_txq *txq)
{
	struct sk_buff *skb;

	local_bh_disable();
	skb = ieee80211_tx_dequeue(hw, txq);
	local_bh_enable();

	return skb;
}

/**
 * ieee80211_handle_wake_tx_queue - mac80211 handler for wake_tx_queue callback
 *
 * @hw: pointer as obtained in the wake_tx_queue() callback
 * @txq: pointer as obtained in the wake_tx_queue() callback
 *
 * Drivers can use this function for the mandatory mac80211 wake_tx_queue
 * callback in struct ieee80211_ops; they should not call this function
 * themselves.
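 *
 * Typical use is to assign it directly in the ops table, e.g.:
 *
 *	static const struct ieee80211_ops drv_ops = {
 *		...
 *		.wake_tx_queue = ieee80211_handle_wake_tx_queue,
 *		...
 *	};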
 */
void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw,
				    struct ieee80211_txq *txq);

/**
 * ieee80211_next_txq - get next tx queue to pull packets from
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @ac: AC number to return packets from.
 *
 * Return: the next txq if successful, %NULL if no queue is eligible. If a txq
 * is returned, it should be returned with ieee80211_return_txq() after the
 * driver has finished scheduling it.
 */
struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac);

/**
 * ieee80211_txq_schedule_start - start new scheduling round for TXQs
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @ac: AC number to acquire locks for
 *
 * Should be called before ieee80211_next_txq() or ieee80211_return_txq().
 * The driver must not call multiple TXQ scheduling rounds concurrently.
 */
void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac);

/* (deprecated) */
static inline void ieee80211_txq_schedule_end(struct ieee80211_hw *hw, u8 ac)
{
}

void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
			      struct ieee80211_txq *txq, bool force);

/**
 * ieee80211_schedule_txq - schedule a TXQ for transmission
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface
 *
 * Schedules a TXQ for transmission if it is not already scheduled,
 * even if mac80211 does not have any packets buffered.
 *
 * The driver may call this function if it has buffered packets for
 * this TXQ internally.
 */
static inline void
ieee80211_schedule_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq)
{
	__ieee80211_schedule_txq(hw, txq, true);
}

/**
 * ieee80211_return_txq - return a TXQ previously acquired by ieee80211_next_txq()
 *
 * @hw: pointer as obtained from ieee80211_alloc_hw()
 * @txq: pointer obtained from station or virtual interface
 * @force: schedule txq even if mac80211 does not have any buffered packets.
 *
 * The driver may set force=true if it has buffered packets for this TXQ
 * internally.
 */
static inline void
ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq,
		     bool force)
{
	__ieee80211_schedule_txq(hw, txq, force);
}

/**
 * ieee80211_txq_may_transmit - check whether TXQ is allowed to transmit
 *
 * This function is used to check whether given txq is allowed to transmit by
 * the airtime scheduler, and can be used by drivers to access the airtime
 * fairness accounting without using the scheduling order enforced by
 * next_txq().
 *
 * Returns %true if the airtime scheduler thinks the TXQ should be allowed to
 * transmit, and %false if it should be throttled. This function can also have
 * the side effect of rotating the TXQ in the scheduler rotation, which will
 * eventually bring the deficit to positive and allow the station to transmit
 * again.
 *
 * This function also keeps the TXQ list aligned with the driver's own
 * round-robin scheduler list, i.e. it rotates the TXQ list until the
 * requested node becomes the first entry, so that both the TXQ list and the
 * driver's list stay in sync. If this function returns %true, the driver is
 * expected to schedule packets for transmission, and then return the TXQ
 * through ieee80211_return_txq().
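 *
 * An illustrative sequence (hypothetical driver code):
 *
 *	ieee80211_txq_schedule_start(hw, txq->ac);
 *	if (ieee80211_txq_may_transmit(hw, txq)) {
 *		drv_push_pending_frames(priv, txq);
 *		ieee80211_return_txq(hw, txq, false);
 *	}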
* * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface * * Return: %true if transmission is allowed, %false otherwise */ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq); /** * ieee80211_txq_get_depth - get pending frame/byte count of given txq * * The values are not guaranteed to be coherent with regard to each other, i.e. * txq state can change half-way of this function and the caller may end up * with "new" frame_cnt and "old" byte_cnt or vice-versa. * * @txq: pointer obtained from station or virtual interface * @frame_cnt: pointer to store frame count * @byte_cnt: pointer to store byte count */ void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *frame_cnt, unsigned long *byte_cnt); /** * ieee80211_nan_func_terminated - notify about NAN function termination. * * This function is used to notify mac80211 about NAN function termination. * Note that this function can't be called from hard irq. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @inst_id: the local instance id * @reason: termination reason (one of the NL80211_NAN_FUNC_TERM_REASON_*) * @gfp: allocation flags */ void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, u8 inst_id, enum nl80211_nan_func_term_reason reason, gfp_t gfp); /** * ieee80211_nan_func_match - notify about NAN function match event. * * This function is used to notify mac80211 about NAN function match. The * cookie inside the match struct will be assigned by mac80211. * Note that this function can't be called from hard irq. * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @match: match event information * @gfp: allocation flags */ void ieee80211_nan_func_match(struct ieee80211_vif *vif, struct cfg80211_nan_match_params *match, gfp_t gfp); /** * ieee80211_calc_rx_airtime - calculate estimated transmission airtime for RX. * * This function calculates the estimated airtime usage of a frame based on the * rate information in the RX status struct and the frame length. * * @hw: pointer as obtained from ieee80211_alloc_hw() * @status: &struct ieee80211_rx_status containing the transmission rate * information. * @len: frame length in bytes * * Return: the airtime estimate */ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, struct ieee80211_rx_status *status, int len); /** * ieee80211_calc_tx_airtime - calculate estimated transmission airtime for TX. * * This function calculates the estimated airtime usage of a frame based on the * rate information in the TX info struct and the frame length. * * @hw: pointer as obtained from ieee80211_alloc_hw() * @info: &struct ieee80211_tx_info of the frame. * @len: frame length in bytes * * Return: the airtime estimate */ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int len); /** * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. * * The driver is responsible for freeing the returned skb. * * Return: FILS discovery template. %NULL on error. */ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_get_unsol_bcast_probe_resp_tmpl - Get unsolicited broadcast * probe response template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. 
* * The driver is responsible for freeing the returned skb. * * Return: Unsolicited broadcast probe response template. %NULL on error. */ struct sk_buff * ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif); /** * ieee80211_obss_color_collision_notify - notify userland about a BSS color * collision. * @link_id: valid link_id during MLO or 0 for non-MLO * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is * aware of. */ void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, u64 color_bitmap, u8 link_id); /** * ieee80211_is_tx_data - check if frame is a data frame * * The function is used to check if a frame is a data frame. Frames with * hardware encapsulation enabled are data frames. * * @skb: the frame to be transmitted. * * Return: %true if @skb is a data frame, %false otherwise */ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *) skb->data; return info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP || ieee80211_is_data(hdr->frame_control); } /** * ieee80211_set_active_links - set active links in client mode * @vif: interface to set active links on * @active_links: the new active links bitmap * * Context: Must be called with wiphy mutex held; may sleep; calls * back into the driver. * * This changes the active links on an interface. The interface * must be in client mode (in AP mode, all links are always active), * and @active_links must be a subset of the vif's valid_links. * * If a link is switched off and another is switched on at the same * time (e.g. active_links going from 0x1 to 0x10) then you will get * a sequence of calls like * * - change_vif_links(0x11) * - unassign_vif_chanctx(link_id=0) * - assign_vif_chanctx(link_id=4) * - change_sta_links(0x11) for each affected STA (the AP) * (TDLS connections on now inactive links should be torn down) * - remove group keys on the old link (link_id 0) * - add new group keys (GTK/IGTK/BIGTK) on the new link (link_id 4) * - change_sta_links(0x10) for each affected STA (the AP) * - change_vif_links(0x10) * * Return: 0 on success. An error code otherwise. */ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); /** * ieee80211_set_active_links_async - asynchronously set active links * @vif: interface to set active links on * @active_links: the new active links bitmap * * See ieee80211_set_active_links() for more information, the only * difference here is that the link change is triggered async and * can be called in any context, but the link switch will only be * completed after it returns. */ void ieee80211_set_active_links_async(struct ieee80211_vif *vif, u16 active_links); /** * ieee80211_send_teardown_neg_ttlm - tear down a negotiated TTLM request * @vif: the interface on which the tear down request should be sent. * * This function can be used to tear down a previously accepted negotiated * TTLM request. 
 */
void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif);

/**
 * ieee80211_chan_width_to_rx_bw - convert channel width to STA RX bandwidth
 * @width: the channel width value to convert
 * Return: the STA RX bandwidth value for the channel width
 */
static inline enum ieee80211_sta_rx_bandwidth
ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width)
{
	switch (width) {
	default:
		WARN_ON_ONCE(1);
		fallthrough;
	case NL80211_CHAN_WIDTH_20_NOHT:
	case NL80211_CHAN_WIDTH_20:
		return IEEE80211_STA_RX_BW_20;
	case NL80211_CHAN_WIDTH_40:
		return IEEE80211_STA_RX_BW_40;
	case NL80211_CHAN_WIDTH_80:
		return IEEE80211_STA_RX_BW_80;
	case NL80211_CHAN_WIDTH_160:
	case NL80211_CHAN_WIDTH_80P80:
		return IEEE80211_STA_RX_BW_160;
	case NL80211_CHAN_WIDTH_320:
		return IEEE80211_STA_RX_BW_320;
	}
}

/**
 * ieee80211_prepare_rx_omi_bw - prepare for sending BW RX OMI
 * @link_sta: the link STA the OMI is going to be sent to
 * @bw: the bandwidth requested
 *
 * When the driver decides to do RX OMI to change bandwidth with a STA
 * it calls this function to prepare, then sends the OMI, and finally
 * calls ieee80211_finalize_rx_omi_bw().
 *
 * Note that the (link) STA rate control is updated accordingly as well,
 * but the chanctx might not be updated if there are other users.
 * If the intention is to reduce the listen bandwidth, the driver must
 * ensure there are no TDLS stations nor other uses of the chanctx.
 *
 * Also note that in order to sequence correctly, narrowing bandwidth
 * will only happen in ieee80211_finalize_rx_omi_bw(), whereas widening
 * again (e.g. going back to normal) will happen here.
 *
 * Note that we treat this symmetrically, so if the driver calls this
 * and tells the peer to only send with a lower bandwidth, we assume
 * that the driver also wants to only send at that lower bandwidth, to
 * allow narrowing of the chanctx request for this station/interface.
 *
 * Finally, the driver must ensure that if the function returned %true,
 * ieee80211_finalize_rx_omi_bw() is also called, even for example in
 * case of HW restart.
 *
 * Context: Must be called with wiphy mutex held, and will call back
 *	into the driver, so ensure no driver locks are held.
 *
 * Return: %true if changes are going to be made, %false otherwise
 */
bool ieee80211_prepare_rx_omi_bw(struct ieee80211_link_sta *link_sta,
				 enum ieee80211_sta_rx_bandwidth bw);

/**
 * ieee80211_finalize_rx_omi_bw - finalize BW RX OMI update
 * @link_sta: the link STA the OMI was sent to
 *
 * See ieee80211_prepare_rx_omi_bw(). Context requirements are the same
 * here as well.
 */
void ieee80211_finalize_rx_omi_bw(struct ieee80211_link_sta *link_sta);

/* for older drivers - let's not document these ... */
int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw,
				  struct ieee80211_chanctx_conf *ctx);
void ieee80211_emulate_remove_chanctx(struct ieee80211_hw *hw,
				      struct ieee80211_chanctx_conf *ctx);
void ieee80211_emulate_change_chanctx(struct ieee80211_hw *hw,
				      struct ieee80211_chanctx_conf *ctx,
				      u32 changed);
int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw,
					 struct ieee80211_vif_chanctx_switch *vifs,
					 int n_vifs,
					 enum ieee80211_chanctx_switch_mode mode);

#endif /* MAC80211_H */
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bcachefs_ioctl.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "compress.h"
#include "disk_accounting.h"
#include "error.h"
#include "journal_io.h"
#include "replicas.h"

/*
 * Notes on disk accounting:
 *
 * We have two parallel sets of counters to be concerned with, and both must be
 * kept in sync.
 *
 * - Persistent/on disk accounting, stored in the accounting btree and updated
 *   via btree write buffer updates that treat new accounting keys as deltas to
 *   apply to existing values. But reading from a write buffer btree is
 *   expensive, so we also have
 *
 * - In memory accounting, where accounting is stored as an array of percpu
 *   counters, indexed by an eytzinger array of disk accounting keys/bpos
 *   (which are the same thing, excepting byte swabbing on big endian).
 *
 *   Cheap to read, but non persistent.
 *
 * Disk accounting updates are generated by transactional triggers; these run
 * as keys enter and leave the btree, and can compare old and new versions of
 * keys; the output of these triggers are deltas to the various counters.
 *
 * Disk accounting updates are done as btree write buffer updates, where the
 * counters in the disk accounting key are deltas that will be applied to the
 * counter in the btree when the key is flushed by the write buffer (or journal
 * replay).
 *
 * To do a disk accounting update:
 * - initialize a disk_accounting_pos, to specify which counter is being
 *   updated
 * - initialize counter deltas, as an array of 1-3 s64s
 * - call bch2_disk_accounting_mod()
 *
 * This queues up the accounting update to be done at transaction commit time.
 * Underneath, it's a normal btree write buffer update.
 *
 * The transaction commit path is responsible for propagating updates to the in
 * memory counters, with bch2_accounting_mem_mod().
 *
 * The commit path also assigns every disk accounting update a unique version
 * number, based on the journal sequence number and offset within that journal
 * buffer; this is used by journal replay to determine which updates have been
 * done.
 *
 * The transaction commit path also ensures that replicas entry accounting
 * updates are properly marked in the superblock (so that we know whether we
 * can mount without data being unavailable); it will update the superblock if
 * bch2_accounting_mem_mod() tells it to.
 */

static const char * const disk_accounting_type_strs[] = {
#define x(t, n, ...)
[n] = #t, BCH_DISK_ACCOUNTING_TYPES() #undef x NULL }; static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, s64 *d, unsigned nr) { struct bkey_i_accounting *acc = bkey_accounting_init(k); acc->k.p = disk_accounting_pos_to_bpos(pos); set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); memcpy_u64s_small(acc->v.d, d, nr); } static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) { /* Normalize: */ switch (k->type) { case BCH_DISK_ACCOUNTING_replicas: bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp); break; } BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; accounting_key_init(&k_i.k, k, d, nr); if (unlikely(gc)) { int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); if (ret == -BCH_ERR_btree_insert_need_mark_replicas) ret = drop_locks_do(trans, bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); return ret; } else { return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); } } int bch2_mod_dev_cached_sectors(struct btree_trans *trans, unsigned dev, s64 sectors, bool gc) { struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_replicas, }; bch2_replicas_entry_cached(&acc.replicas, dev); return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); } static inline bool is_zero(char *start, char *end) { BUG_ON(start > end); for (; start < end; start++) if (*start) return false; return true; } #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); void *end = &acc_k + 1; int ret = 0; bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && bversion_zero(k.k->bversion), c, accounting_key_version_0, "accounting key with version=0"); switch (acc_k.type) { case BCH_DISK_ACCOUNTING_nr_inodes: end = field_end(acc_k, nr_inodes); break; case BCH_DISK_ACCOUNTING_persistent_reserved: end = field_end(acc_k, persistent_reserved); break; case BCH_DISK_ACCOUNTING_replicas: bkey_fsck_err_on(!acc_k.replicas.nr_devs, c, accounting_key_replicas_nr_devs_0, "accounting key replicas entry with nr_devs=0"); bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || (acc_k.replicas.nr_required > 1 && acc_k.replicas.nr_required == acc_k.replicas.nr_devs), c, accounting_key_replicas_nr_required_bad, "accounting key replicas entry with bad nr_required"); for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], c, accounting_key_replicas_devs_unsorted, "accounting key replicas entry with unsorted devs"); end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); break; case BCH_DISK_ACCOUNTING_dev_data_type: end = field_end(acc_k, dev_data_type); break; case BCH_DISK_ACCOUNTING_compression: end = field_end(acc_k, compression); break; case BCH_DISK_ACCOUNTING_snapshot: end = field_end(acc_k, snapshot); break; case BCH_DISK_ACCOUNTING_btree: end = field_end(acc_k, btree); break; case BCH_DISK_ACCOUNTING_rebalance_work: end = field_end(acc_k, rebalance_work); break; } bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), c, 
accounting_key_junk_at_end, "junk at end of accounting key"); fsck_err: return ret; } void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) { if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { prt_printf(out, "unknown type %u", k->type); return; } prt_str(out, disk_accounting_type_strs[k->type]); prt_str(out, " "); switch (k->type) { case BCH_DISK_ACCOUNTING_nr_inodes: break; case BCH_DISK_ACCOUNTING_persistent_reserved: prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas); break; case BCH_DISK_ACCOUNTING_replicas: bch2_replicas_entry_to_text(out, &k->replicas); break; case BCH_DISK_ACCOUNTING_dev_data_type: prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); bch2_prt_data_type(out, k->dev_data_type.data_type); break; case BCH_DISK_ACCOUNTING_compression: bch2_prt_compression_type(out, k->compression.type); break; case BCH_DISK_ACCOUNTING_snapshot: prt_printf(out, "id=%u", k->snapshot.id); break; case BCH_DISK_ACCOUNTING_btree: prt_str(out, "btree="); bch2_btree_id_to_text(out, k->btree.id); break; } } void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k); struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); bch2_accounting_key_to_text(out, &acc_k); for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) prt_printf(out, " %lli", acc.v->d[i]); } void bch2_accounting_swab(struct bkey_s k) { for (u64 *p = (u64 *) k.v; p < (u64 *) bkey_val_end(k); p++) *p = swab64(*p); } static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct disk_accounting_pos *acc) { unsafe_memcpy(r, &acc->replicas, replicas_entry_bytes(&acc->replicas), "variable length struct"); } static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, p); switch (acc_k.type) { case BCH_DISK_ACCOUNTING_replicas: __accounting_to_replicas(r, &acc_k); return true; default: return false; } } static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) { struct bch_replicas_padded r; return accounting_to_replicas(&r.e, p) ? bch2_mark_replicas(c, &r.e) : 0; } /* * Ensure accounting keys being updated are present in the superblock, when * applicable (i.e. 
replicas updates)
 */
int bch2_accounting_update_sb(struct btree_trans *trans)
{
	for (struct jset_entry *i = trans->journal_entries;
	     i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
	     i = vstruct_next(i))
		if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) {
			int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p);
			if (ret)
				return ret;
		}

	return 0;
}

static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a)
{
	struct bch_accounting_mem *acc = &c->accounting;

	/* raced with another insert, already present: */
	if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			    accounting_pos_cmp, &a.k->p) < acc->k.nr)
		return 0;

	struct accounting_mem_entry n = {
		.pos		= a.k->p,
		.bversion	= a.k->bversion,
		.nr_counters	= bch2_accounting_counters(a.k),
		.v[0]		= __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
						     sizeof(u64), GFP_KERNEL),
	};

	if (!n.v[0])
		goto err;

	if (acc->gc_running) {
		n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
					    sizeof(u64), GFP_KERNEL);
		if (!n.v[1])
			goto err;
	}

	if (darray_push(&acc->k, n))
		goto err;

	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			accounting_pos_cmp, NULL);

	if (trace_accounting_mem_insert_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_accounting_to_text(&buf, c, a.s_c);
		trace_accounting_mem_insert(c, buf.buf);
		printbuf_exit(&buf);
	}
	return 0;
err:
	free_percpu(n.v[1]);
	free_percpu(n.v[0]);
	return -BCH_ERR_ENOMEM_disk_accounting;
}

int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
			       enum bch_accounting_mode mode)
{
	struct bch_replicas_padded r;

	if (mode != BCH_ACCOUNTING_read &&
	    accounting_to_replicas(&r.e, a.k->p) &&
	    !bch2_replicas_marked_locked(c, &r.e))
		return -BCH_ERR_btree_insert_need_mark_replicas;

	percpu_up_read(&c->mark_lock);
	percpu_down_write(&c->mark_lock);
	int ret = __bch2_accounting_mem_insert(c, a);
	percpu_up_write(&c->mark_lock);
	percpu_down_read(&c->mark_lock);
	return ret;
}

static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
{
	for (unsigned i = 0; i < e->nr_counters; i++)
		if (percpu_u64_get(e->v[0] + i) ||
		    (e->v[1] &&
		     percpu_u64_get(e->v[1] + i)))
			return false;
	return true;
}

void bch2_accounting_mem_gc(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;

	percpu_down_write(&c->mark_lock);
	struct accounting_mem_entry *dst = acc->k.data;

	darray_for_each(acc->k, src) {
		if (accounting_mem_entry_is_zero(src)) {
			free_percpu(src->v[0]);
			free_percpu(src->v[1]);
		} else {
			*dst++ = *src;
		}
	}

	acc->k.nr = dst - acc->k.data;
	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			accounting_pos_cmp, NULL);
	percpu_up_write(&c->mark_lock);
}

/*
 * Read out accounting keys for replicas entries, as an array of
 * bch_replicas_usage entries.
 *
 * Note: this may be deprecated/removed at some point in the future and
 * replaced with something more general; it exists to support the ioctl used
 * by the 'bcachefs fs usage' command.
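 *
 * An illustrative walk of the result (hypothetical caller code; entries are
 * variable length, hence the assumed replicas_usage_next() helper):
 *
 *	darray_char usage;
 *	int ret = bch2_fs_replicas_usage_read(c, &usage);
 *
 *	if (!ret) {
 *		for (struct bch_replicas_usage *u = (void *) usage.data;
 *		     (void *) u < (void *) usage.data + usage.nr;
 *		     u = replicas_usage_next(u))
 *			pr_info("replicas entry: %llu sectors", u->sectors);
 *		darray_exit(&usage);
 *	}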
*/ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) { struct bch_accounting_mem *acc = &c->accounting; int ret = 0; darray_init(usage); percpu_down_read(&c->mark_lock); darray_for_each(acc->k, i) { struct { struct bch_replicas_usage r; u8 pad[BCH_BKEY_PTRS_MAX]; } u; if (!accounting_to_replicas(&u.r.r, i->pos)) continue; u64 sectors; bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false); u.r.sectors = sectors; ret = darray_make_room(usage, replicas_usage_bytes(&u.r)); if (ret) break; memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r)); usage->nr += replicas_usage_bytes(&u.r); } percpu_up_read(&c->mark_lock); if (ret) darray_exit(usage); return ret; } int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask) { struct bch_accounting_mem *acc = &c->accounting; int ret = 0; darray_init(out_buf); percpu_down_read(&c->mark_lock); darray_for_each(acc->k, i) { struct disk_accounting_pos a_p; bpos_to_disk_accounting_pos(&a_p, i->pos); if (!(accounting_types_mask & BIT(a_p.type))) continue; ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) + sizeof(u64) * i->nr_counters); if (ret) break; struct bkey_i_accounting *a_out = bkey_accounting_init((void *) &darray_top(*out_buf)); set_bkey_val_u64s(&a_out->k, i->nr_counters); a_out->k.p = i->pos; bch2_accounting_mem_read_counters(acc, i - acc->k.data, a_out->v.d, i->nr_counters, false); if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out))) out_buf->nr += bkey_bytes(&a_out->k); } percpu_up_read(&c->mark_lock); if (ret) darray_exit(out_buf); return ret; } static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) { darray_for_each(acc->k, e) { free_percpu(e->v[gc]); e->v[gc] = NULL; } } int bch2_gc_accounting_start(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; int ret = 0; percpu_down_write(&c->mark_lock); darray_for_each(acc->k, e) { e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), sizeof(u64), GFP_KERNEL); if (!e->v[1]) { bch2_accounting_free_counters(acc, true); ret = -BCH_ERR_ENOMEM_disk_accounting; break; } } acc->gc_running = !ret; percpu_up_write(&c->mark_lock); return ret; } int bch2_gc_accounting_done(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; struct bpos pos = POS_MIN; int ret = 0; percpu_down_write(&c->mark_lock); while (1) { unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &pos); if (idx >= acc->k.nr) break; struct accounting_mem_entry *e = acc->k.data + idx; pos = bpos_successor(e->pos); struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, e->pos); if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) continue; u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; unsigned nr = e->nr_counters; bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); if (memcmp(dst_v, src_v, nr * sizeof(u64))) { printbuf_reset(&buf); prt_str(&buf, "accounting mismatch for "); bch2_accounting_key_to_text(&buf, &acc_k); prt_str(&buf, ": got"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", dst_v[j]); prt_str(&buf, " should be"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", src_v[j]); for (unsigned j = 0; j < nr; j++) src_v[j] -= dst_v[j]; if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { 
percpu_up_write(&c->mark_lock); ret = commit_do(trans, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); percpu_down_write(&c->mark_lock); if (ret) goto err; if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; accounting_key_init(&k_i.k, &acc_k, src_v, nr); bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), BCH_ACCOUNTING_normal); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); struct bch_fs_usage_base *src = &trans->fs_usage_delta; acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); preempt_enable(); } } } } err: fsck_err: percpu_up_write(&c->mark_lock); printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; if (k.k->type != KEY_TYPE_accounting) return 0; percpu_down_read(&c->mark_lock); int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), BCH_ACCOUNTING_read); percpu_up_read(&c->mark_lock); return ret; } static int bch2_disk_accounting_validate_late(struct btree_trans *trans, struct disk_accounting_pos acc, u64 *v, unsigned nr) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0, invalid_dev = -1; switch (acc.type) { case BCH_DISK_ACCOUNTING_replicas: { struct bch_replicas_padded r; __accounting_to_replicas(&r.e, &acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && !bch2_dev_exists(c, r.e.devs[i])) { invalid_dev = r.e.devs[i]; goto invalid_device; } /* * All replicas entry checks except for invalid device are done * in bch2_accounting_validate */ BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), trans, accounting_replicas_not_marked, "accounting not marked in superblock replicas\n %s", (printbuf_reset(&buf), bch2_accounting_key_to_text(&buf, &acc), buf.buf))) { /* * We're not RW yet and still single threaded, dropping * and retaking lock is ok: */ percpu_up_write(&c->mark_lock); ret = bch2_mark_replicas(c, &r.e); if (ret) goto fsck_err; percpu_down_write(&c->mark_lock); } break; } case BCH_DISK_ACCOUNTING_dev_data_type: if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { invalid_dev = acc.dev_data_type.dev; goto invalid_device; } break; } fsck_err: printbuf_exit(&buf); return ret; invalid_device: if (fsck_err(trans, accounting_to_invalid_device, "accounting entry points to invalid device %i\n %s", invalid_dev, (printbuf_reset(&buf), bch2_accounting_key_to_text(&buf, &acc), buf.buf))) { for (unsigned i = 0; i < nr; i++) v[i] = -v[i]; ret = commit_do(trans, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: -BCH_ERR_remove_disk_accounting_entry; } else { ret = -BCH_ERR_remove_disk_accounting_entry; } goto fsck_err; } /* * At startup time, initialize the in memory accounting from the btree (and * journal) */ int bch2_accounting_read(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; /* * We might run more than once if we rewind to start topology repair or * btree node scan - and those might cause us to get different results, * so we can't just skip if we've already run. 
* * Instead, zero out any accounting we have: */ percpu_down_write(&c->mark_lock); darray_for_each(acc->k, e) percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); for_each_member_device(c, ca) percpu_memset(ca->usage, 0, sizeof(*ca->usage)); percpu_memset(c->usage, 0, sizeof(*c->usage)); percpu_up_write(&c->mark_lock); struct btree_iter iter; bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); iter.flags &= ~BTREE_ITER_with_journal; int ret = for_each_btree_key_continue(trans, iter, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); if (k.k->type != KEY_TYPE_accounting) continue; struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; if (!bch2_accounting_is_mem(acc_k)) { struct disk_accounting_pos next = { .type = acc_k.type + 1 }; bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; } accounting_read_key(trans, k); })); if (ret) goto err; struct journal_keys *keys = &c->journal_keys; struct journal_key *dst = keys->data; move_gap(keys, keys->nr); darray_for_each(*keys, i) { if (i->k->k.type == KEY_TYPE_accounting) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); if (!bch2_accounting_is_mem(acc_k)) continue; struct bkey_s_c k = bkey_i_to_s_c(i->k); unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &k.k->p); bool applied = idx < acc->k.nr && bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0; if (applied) continue; if (i + 1 < &darray_top(*keys) && i[1].k->k.type == KEY_TYPE_accounting && !journal_key_cmp(i, i + 1)) { WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); i[1].journal_seq = i[0].journal_seq; bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k), bkey_s_c_to_accounting(k)); continue; } ret = accounting_read_key(trans, k); if (ret) goto err; } *dst++ = *i; } keys->gap = keys->nr = dst - keys->data; percpu_down_write(&c->mark_lock); darray_for_each_reverse(acc->k, i) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, i->pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; memset(v, 0, sizeof(v)); for (unsigned j = 0; j < i->nr_counters; j++) v[j] = percpu_u64_get(i->v[0] + j); /* * If the entry counters are zeroed, it should be treated as * nonexistent - it might point to an invalid device. * * Remove it, so that if it's re-added it gets re-marked in the * superblock: */ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? 
-BCH_ERR_remove_disk_accounting_entry : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { free_percpu(i->v[0]); free_percpu(i->v[1]); darray_remove_item(&acc->k, i); ret = 0; continue; } if (ret) goto fsck_err; } eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); preempt_disable(); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); for (unsigned i = 0; i < acc->k.nr; i++) { struct disk_accounting_pos k; bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); switch (k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: usage->reserved += v[0] * k.persistent_reserved.nr_replicas; break; case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; percpu_u64_set(&d->buckets, v[0]); percpu_u64_set(&d->sectors, v[1]); percpu_u64_set(&d->fragmented, v[2]); if (k.dev_data_type.data_type == BCH_DATA_sb || k.dev_data_type.data_type == BCH_DATA_journal) usage->hidden += v[0] * ca->mi.bucket_size; } rcu_read_unlock(); break; } } preempt_enable(); fsck_err: percpu_up_write(&c->mark_lock); err: printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) { return bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans) ?: for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ struct disk_accounting_pos acc; bpos_to_disk_accounting_pos(&acc, k.k->p); acc.type == BCH_DISK_ACCOUNTING_dev_data_type && acc.dev_data_type.dev == dev ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) : 0; })) ?: bch2_btree_write_buffer_flush_sync(trans)); } int bch2_dev_usage_init(struct bch_dev *ca, bool gc) { struct bch_fs *c = ca->fs; struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_dev_data_type, .dev_data_type.dev = ca->dev_idx, .dev_data_type.data_type = BCH_DATA_free, }; u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; int ret = bch2_trans_do(c, ({ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?: (!gc ? 
bch2_trans_commit(trans, NULL, NULL, 0) : 0); })); bch_err_fn(c, ret); return ret; } void bch2_verify_accounting_clean(struct bch_fs *c) { bool mismatch = false; struct bch_fs_usage_base base = {}, base_inmem = {}; bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_all_snapshots, k, ({ u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); unsigned nr = bch2_accounting_counters(k.k); struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; if (!bch2_accounting_is_mem(acc_k)) { struct disk_accounting_pos next = { .type = acc_k.type + 1 }; bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; } bch2_accounting_mem_read(c, k.k->p, v, nr); if (memcmp(a.v->d, v, nr * sizeof(u64))) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, " !="); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", v[j]); pr_err("%s", buf.buf); printbuf_exit(&buf); mismatch = true; } switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; break; case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); break; case BCH_DISK_ACCOUNTING_dev_data_type: { rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (!ca) { rcu_read_unlock(); continue; } v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); rcu_read_unlock(); if (memcmp(a.v->d, v, 3 * sizeof(u64))) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, " in mem"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", v[j]); pr_err("dev accounting mismatch: %s", buf.buf); printbuf_exit(&buf); mismatch = true; } } } 0; }))); acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64)); #define check(x) \ if (base.x != base_inmem.x) { \ pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \ mismatch = true; \ } //check(hidden); check(btree); check(data); check(cached); check(reserved); check(nr_inodes); WARN_ON(mismatch); } void bch2_accounting_gc_free(struct bch_fs *c) { lockdep_assert_held(&c->mark_lock); struct bch_accounting_mem *acc = &c->accounting; bch2_accounting_free_counters(acc, true); acc->gc_running = false; } void bch2_fs_accounting_exit(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; bch2_accounting_free_counters(acc, false); darray_exit(&acc->k); }
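/*
 * Illustrative sketch, not part of the original file: how one in-memory
 * accounting entry's percpu counters read back as plain u64s, mirroring
 * bch2_accounting_mem_read_counters() above. percpu_u64_get() (a bcachefs
 * helper) sums a single counter slot across all CPUs; v[0] holds the live
 * counters and v[1] the gc copy, selected here by the 'gc' flag.
 */
static void example_read_entry(struct accounting_mem_entry *e, bool gc, u64 *out)
{
	u64 __percpu *src = e->v[gc];

	for (unsigned i = 0; i < e->nr_counters; i++)
		out[i] = src ? percpu_u64_get(src + i) : 0;
}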
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * IRQ subsystem internal functions and variables:
 *
 * Do not ever include this file from anything else than
 * kernel/irq/. Do not even think about using any information outside
 * of this file for your non core code.
 */
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
#include <linux/pm_runtime.h>
#include <linux/sched/clock.h>

#ifdef CONFIG_SPARSE_IRQ
# define MAX_SPARSE_IRQS	INT_MAX
#else
# define MAX_SPARSE_IRQS	NR_IRQS
#endif

#define istate core_internal_state__do_not_mess_with_it

extern bool noirqdebug;

extern struct irqaction chained_action;

/*
 * Bits used by threaded handlers:
 * IRQTF_RUNTHREAD  - signals that the interrupt handler thread should run
 * IRQTF_WARNED     - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
 * IRQTF_AFFINITY   - irq thread is requested to adjust affinity
 * IRQTF_FORCED_THREAD  - irq action is force threaded
 * IRQTF_READY      - signals that irq thread is ready
 */
enum {
	IRQTF_RUNTHREAD,
	IRQTF_WARNED,
	IRQTF_AFFINITY,
	IRQTF_FORCED_THREAD,
	IRQTF_READY,
};

/*
 * Bit masks for desc->core_internal_state__do_not_mess_with_it
 *
 * IRQS_AUTODETECT		- autodetection in progress
 * IRQS_SPURIOUS_DISABLED	- was disabled due to spurious interrupt
 *				  detection
 * IRQS_POLL_INPROGRESS		- polling in progress
 * IRQS_ONESHOT			- irq is not unmasked in primary handler
 * IRQS_REPLAY			- irq has been resent and will not be resent
 *				  again until the handler has run and cleared
 *				  this flag.
* IRQS_WAITING - irq is waiting * IRQS_PENDING - irq needs to be resent and should be resent * at the next available opportunity. * IRQS_SUSPENDED - irq is suspended * IRQS_NMI - irq line is used to deliver NMIs * IRQS_SYSFS - descriptor has been added to sysfs */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, IRQS_ONESHOT = 0x00000020, IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, IRQS_PENDING = 0x00000200, IRQS_SUSPENDED = 0x00000800, IRQS_TIMINGS = 0x00001000, IRQS_NMI = 0x00002000, IRQS_SYSFS = 0x00004000, }; #include "debug.h" #include "settings.h" extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags); extern void __disable_irq(struct irq_desc *desc); extern void __enable_irq(struct irq_desc *desc); #define IRQ_RESEND true #define IRQ_NORESEND false #define IRQ_START_FORCE true #define IRQ_START_COND false extern int irq_activate(struct irq_desc *desc); extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); extern void irq_shutdown_and_deactivate(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask); #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } #else extern void irq_mark_irq(unsigned int irq); #endif extern int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ int check_irq_resend(struct irq_desc *desc, bool inject); void clear_irq_resend(struct irq_desc *desc); void irq_resend_init(struct irq_desc *desc); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); void wake_threads_waitq(struct irq_desc *desc); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); #else static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif extern bool irq_can_set_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); #ifdef CONFIG_SMP extern int irq_setup_affinity(struct irq_desc *desc); #else static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; } #endif /* Inline functions for support of irq 
chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { if (unlikely(desc->irq_data.chip->irq_bus_lock)) desc->irq_data.chip->irq_bus_lock(&desc->irq_data); } static inline void chip_bus_sync_unlock(struct irq_desc *desc) { if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock)) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } #define _IRQ_DESC_CHECK (1 << 0) #define _IRQ_DESC_PERCPU (1 << 1) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) #define for_each_action_of_desc(desc, act) \ for (act = desc->action; act; act = act->next) struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check); void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); static inline struct irq_desc * irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check) { return __irq_get_desc_lock(irq, flags, true, check); } static inline void irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) { __irq_put_desc_unlock(desc, flags, true); } static inline struct irq_desc * irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check) { return __irq_get_desc_lock(irq, flags, false, check); } static inline void irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) { __irq_put_desc_unlock(desc, flags, false); } #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline unsigned int irqd_get(struct irq_data *d) { return __irqd_to_state(d); } /* * Manipulation functions for irq_data.state */ static inline void irqd_set_move_pending(struct irq_data *d) { __irqd_to_state(d) |= IRQD_SETAFFINITY_PENDING; } static inline void irqd_clr_move_pending(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING; } static inline void irqd_set_managed_shutdown(struct irq_data *d) { __irqd_to_state(d) |= IRQD_MANAGED_SHUTDOWN; } static inline void irqd_clr_managed_shutdown(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_MANAGED_SHUTDOWN; } static inline void irqd_clear(struct irq_data *d, unsigned int mask) { __irqd_to_state(d) &= ~mask; } static inline void irqd_set(struct irq_data *d, unsigned int mask) { __irqd_to_state(d) |= mask; } static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) { return __irqd_to_state(d) & mask; } static inline void irq_state_set_disabled(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); } static inline void irq_state_set_masked(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } #undef __irqd_to_state static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(desc->kstat_irqs->cnt); __this_cpu_inc(kstat.irqs_sum); } static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __kstat_incr_irqs_this_cpu(desc); desc->tot_count++; } static inline int irq_desc_get_node(struct irq_desc *desc) { return irq_common_data_get_node(&desc->irq_common_data); } static inline int irq_desc_is_chained(struct irq_desc *desc) { return (desc->action && desc->action == &chained_action); } static inline bool irq_is_nmi(struct irq_desc *desc) { return desc->istate & IRQS_NMI; } #ifdef CONFIG_PM_SLEEP bool irq_pm_check_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); #else static inline bool irq_pm_check_wakeup(struct irq_desc 
*desc) { return false; }
static inline void irq_pm_install_action(struct irq_desc *desc,
					 struct irqaction *action) { }
static inline void irq_pm_remove_action(struct irq_desc *desc,
					struct irqaction *action) { }
#endif

#ifdef CONFIG_IRQ_TIMINGS

#define IRQ_TIMINGS_SHIFT	5
#define IRQ_TIMINGS_SIZE	(1 << IRQ_TIMINGS_SHIFT)
#define IRQ_TIMINGS_MASK	(IRQ_TIMINGS_SIZE - 1)

/**
 * struct irq_timings - irq timings storing structure
 * @values:	a circular buffer of u64 encoded <timestamp,irq> values
 * @count:	the number of elements in the array
 */
struct irq_timings {
	u64	values[IRQ_TIMINGS_SIZE];
	int	count;
};

DECLARE_PER_CPU(struct irq_timings, irq_timings);

extern void irq_timings_free(int irq);
extern int irq_timings_alloc(int irq);

static inline void irq_remove_timings(struct irq_desc *desc)
{
	desc->istate &= ~IRQS_TIMINGS;

	irq_timings_free(irq_desc_get_irq(desc));
}

static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act)
{
	int irq = irq_desc_get_irq(desc);
	int ret;

	/*
	 * We don't need the measurement because the idle code already
	 * knows the next expiry event.
	 */
	if (act->flags & __IRQF_TIMER)
		return;

	/*
	 * In case the timing allocation fails, we just want to warn,
	 * not fail, so let the system boot anyway.
	 */
	ret = irq_timings_alloc(irq);
	if (ret) {
		pr_warn("Failed to allocate irq timing stats for irq%d (%d)",
			irq, ret);
		return;
	}

	desc->istate |= IRQS_TIMINGS;
}

extern void irq_timings_enable(void);
extern void irq_timings_disable(void);

DECLARE_STATIC_KEY_FALSE(irq_timing_enabled);

/*
 * The interrupt number and the timestamp are encoded into a single
 * u64 variable to optimize the size.
 * A 48 bit time stamp and a 16 bit IRQ number is more than sufficient:
 * who cares about an IRQ after 78 hours of idle time?
 */
static inline u64 irq_timing_encode(u64 timestamp, int irq)
{
	return (timestamp << 16) | irq;
}

static inline int irq_timing_decode(u64 value, u64 *timestamp)
{
	*timestamp = value >> 16;
	return value & U16_MAX;
}

static __always_inline void irq_timings_push(u64 ts, int irq)
{
	struct irq_timings *timings = this_cpu_ptr(&irq_timings);

	timings->values[timings->count & IRQ_TIMINGS_MASK] =
		irq_timing_encode(ts, irq);

	timings->count++;
}

/*
 * The function record_irq_time is only called in one place in the
 * interrupts handler. We want this function always inline so the code
 * inside is embedded in the function and the static key branching
 * code can act at the higher level. Without the explicit
 * __always_inline we can end up with a function call and a small
 * overhead in the hotpath for nothing.
*/ static __always_inline void record_irq_time(struct irq_desc *desc) { if (!static_branch_likely(&irq_timing_enabled)) return; if (desc->istate & IRQS_TIMINGS) irq_timings_push(local_clock(), irq_desc_get_irq(desc)); } #else static inline void irq_remove_timings(struct irq_desc *desc) {} static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act) {}; static inline void record_irq_time(struct irq_desc *desc) {} #endif /* CONFIG_IRQ_TIMINGS */ #ifdef CONFIG_GENERIC_IRQ_CHIP void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); #else static inline void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler) { } #endif /* CONFIG_GENERIC_IRQ_CHIP */ #ifdef CONFIG_GENERIC_PENDING_IRQ static inline bool irq_can_move_pcntxt(struct irq_data *data) { return !(data->chip->flags & IRQCHIP_MOVE_DEFERRED); } static inline bool irq_move_pending(struct irq_data *data) { return irqd_is_setaffinity_pending(data); } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { cpumask_copy(desc->pending_mask, mask); } static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { cpumask_copy(mask, desc->pending_mask); } static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return desc->pending_mask; } bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } static inline bool irq_move_pending(struct irq_data *data) { return false; } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return NULL; } static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve) { irqd_set_activated(data); return 0; } static inline void irq_domain_deactivate_irq(struct irq_data *data) { irqd_clr_activated(data); } #endif static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd) { #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY return irqd->parent_data; #else return NULL; #endif } #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include <linux/debugfs.h> struct irq_bit_descr { unsigned int mask; char *name; }; #define BIT_MASK_DESCR(m) { .mask = m, .name = #m } void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state, const struct irq_bit_descr *sd, int size); void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); static inline void irq_remove_debugfs_entry(struct irq_desc *desc) { debugfs_remove(desc->debugfs_file); kfree(desc->dev_name); } void irq_debugfs_copy_devname(int irq, struct device *dev); # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else static inline void irq_domain_debugfs_init(struct dentry *root) { } # endif #else /* CONFIG_GENERIC_IRQ_DEBUGFS */ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) { } static inline void irq_remove_debugfs_entry(struct irq_desc *d) { } static inline 
void irq_debugfs_copy_devname(int irq, struct device *dev) { } #endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
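/*
 * Illustrative sketch, not part of the original header (and assuming
 * CONFIG_IRQ_TIMINGS): the <timestamp,irq> encoding above round-trips
 * losslessly as long as the timestamp fits in 48 bits and the IRQ
 * number in 16 bits.
 */
static inline bool irq_timing_encode_roundtrips(void)
{
	u64 ts, v = irq_timing_encode(0x123456789abULL, 42);

	return irq_timing_decode(v, &ts) == 42 && ts == 0x123456789abULL;
}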
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2016-present, Facebook, Inc.
 * All rights reserved.
 *
 * zstd_wrapper.c
 */

#include <linux/mutex.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/zstd.h>
#include <linux/vmalloc.h>

#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs.h"
#include "decompressor.h"
#include "page_actor.h"

struct workspace {
	void *mem;
	size_t mem_size;
	size_t window_size;
};

static void *zstd_init(struct squashfs_sb_info *msblk, void *buff)
{
	struct workspace *wksp = kmalloc(sizeof(*wksp), GFP_KERNEL);

	if (wksp == NULL)
		goto failed;
	wksp->window_size = max_t(size_t,
			msblk->block_size, SQUASHFS_METADATA_SIZE);
	wksp->mem_size = zstd_dstream_workspace_bound(wksp->window_size);
	wksp->mem = vmalloc(wksp->mem_size);
	if (wksp->mem == NULL)
		goto failed;

	return wksp;

failed:
	ERROR("Failed to allocate zstd workspace\n");
	kfree(wksp);
	return ERR_PTR(-ENOMEM);
}

static void zstd_free(void *strm)
{
	struct workspace *wksp = strm;

	if (wksp)
		vfree(wksp->mem);
	kfree(wksp);
}

static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
	struct bio *bio, int offset, int length,
	struct squashfs_page_actor *output)
{
	struct workspace *wksp = strm;
	zstd_dstream *stream;
	size_t total_out = 0;
	int error = 0;
	zstd_in_buffer in_buf = { NULL, 0, 0 };
	zstd_out_buffer out_buf = { NULL, 0, 0 };
	struct bvec_iter_all iter_all = {};
	struct bio_vec *bvec = bvec_init_iter_all(&iter_all);

	stream = zstd_init_dstream(wksp->window_size, wksp->mem, wksp->mem_size);
	if (!stream) {
		ERROR("Failed to initialize zstd decompressor\n");
		return -EIO;
	}

	out_buf.size = PAGE_SIZE;
	out_buf.dst = squashfs_first_page(output);
	if (IS_ERR(out_buf.dst)) {
		error = PTR_ERR(out_buf.dst);
		goto finish;
	}

	for (;;) {
		size_t zstd_err;

		if (in_buf.pos == in_buf.size) {
			const void *data;
			int avail;

			if (!bio_next_segment(bio, &iter_all)) {
				error = -EIO;
				break;
			}
			avail = min(length, ((int)bvec->bv_len) - offset);
			data = bvec_virt(bvec);
			length -= avail;
			in_buf.src = data + offset;
			in_buf.size = avail;
			in_buf.pos = 0;
			offset = 0;
		}

		if (out_buf.pos == out_buf.size) {
			out_buf.dst = squashfs_next_page(output);
			if (IS_ERR(out_buf.dst)) {
				error = PTR_ERR(out_buf.dst);
				break;
			} else if (out_buf.dst == NULL) {
				/* Shouldn't run out of pages
				 * before stream is done.
				 */
				error = -EIO;
				break;
			}
			out_buf.pos = 0;
			out_buf.size = PAGE_SIZE;
		}

		total_out -= out_buf.pos;
		zstd_err = zstd_decompress_stream(stream, &out_buf, &in_buf);
		total_out += out_buf.pos; /* add the additional data produced */
		if (zstd_err == 0)
			break;

		if (zstd_is_error(zstd_err)) {
			ERROR("zstd decompression error: %d\n",
					(int)zstd_get_error_code(zstd_err));
			error = -EIO;
			break;
		}
	}

finish:
	squashfs_finish_page(output);

	return error ?
error : total_out; } const struct squashfs_decompressor squashfs_zstd_comp_ops = { .init = zstd_init, .free = zstd_free, .decompress = zstd_uncompress, .id = ZSTD_COMPRESSION, .name = "zstd", .alloc_buffer = 1, .supported = 1 };
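/*
 * Illustrative sketch, not part of the original file: the same kernel zstd
 * streaming API reduced to one contiguous input/output pair, without the
 * bio and page-actor plumbing above. The caller supplies a workspace sized
 * by zstd_dstream_workspace_bound() for the given window size.
 */
static int zstd_uncompress_simple(void *wmem, size_t wmem_size, size_t window,
				  const void *src, size_t src_len,
				  void *dst, size_t dst_len)
{
	zstd_dstream *stream = zstd_init_dstream(window, wmem, wmem_size);
	zstd_in_buffer in = { src, src_len, 0 };
	zstd_out_buffer out = { dst, dst_len, 0 };

	if (!stream)
		return -EIO;

	while (in.pos < in.size) {
		size_t ret = zstd_decompress_stream(stream, &out, &in);

		if (zstd_is_error(ret))
			return -EIO;
		if (ret == 0)	/* frame fully decoded and flushed */
			break;
	}
	return out.pos;		/* bytes produced */
}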
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
*/ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" /* * Check to see if a buffer matching the given parameters is already * a part of the given transaction. */ STATIC struct xfs_buf * xfs_trans_buf_item_match( struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps) { struct xfs_log_item *lip; struct xfs_buf_log_item *blip; int len = 0; int i; for (i = 0; i < nmaps; i++) len += map[i].bm_len; list_for_each_entry(lip, &tp->t_items, li_trans) { blip = (struct xfs_buf_log_item *)lip; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn && blip->bli_buf->b_length == len) { ASSERT(blip->bli_buf->b_map_count == nmaps); return blip->bli_buf; } } return NULL; } /* * Add the locked buffer to the transaction. * * The buffer must be locked, and it cannot be associated with any * transaction. * * If the buffer does not yet have a buf log item associated with it, * then allocate one for it. Then add the buf item to the transaction. */ STATIC void _xfs_trans_bjoin( struct xfs_trans *tp, struct xfs_buf *bp, int reset_recur) { struct xfs_buf_log_item *bip; ASSERT(bp->b_transp == NULL); /* * The xfs_buf_log_item pointer is stored in b_log_item. If * it doesn't have one yet, then allocate one and initialize it. * The checks to see if one is there are in xfs_buf_item_init(). */ xfs_buf_item_init(bp, tp->t_mountp); bip = bp->b_log_item; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); if (reset_recur) bip->bli_recur = 0; /* * Take a reference for this transaction on the buf item. */ atomic_inc(&bip->bli_refcount); /* * Attach the item to the transaction so we can find it in * xfs_trans_get_buf() and friends. */ xfs_trans_add_item(tp, &bip->bli_item); bp->b_transp = tp; } void xfs_trans_bjoin( struct xfs_trans *tp, struct xfs_buf *bp) { _xfs_trans_bjoin(tp, bp, 0); trace_xfs_trans_bjoin(bp->b_log_item); } /* * Get and lock the buffer for the caller if it is not already * locked within the given transaction. If it is already locked * within the transaction, just increment its lock recursion count * and return a pointer to it. * * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ int xfs_trans_get_buf_map( struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_buf *bp; struct xfs_buf_log_item *bip; int error; *bpp = NULL; if (!tp) return xfs_buf_get_map(target, map, nmaps, flags, bpp); /* * If we find the buffer in the cache with this transaction * pointer in its b_fsprivate2 field, then we know we already * have it locked. In this case we just increment the lock * recursion count and return the buffer to the caller. 
 */
	bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
	if (bp != NULL) {
		ASSERT(xfs_buf_islocked(bp));
		if (xfs_is_shutdown(tp->t_mountp)) {
			xfs_buf_stale(bp);
			bp->b_flags |= XBF_DONE;
		}

		ASSERT(bp->b_transp == tp);
		bip = bp->b_log_item;
		ASSERT(bip != NULL);
		ASSERT(atomic_read(&bip->bli_refcount) > 0);
		bip->bli_recur++;
		trace_xfs_trans_get_buf_recur(bip);
		*bpp = bp;
		return 0;
	}

	error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
	if (error)
		return error;

	ASSERT(!bp->b_error);

	_xfs_trans_bjoin(tp, bp, 1);
	trace_xfs_trans_get_buf(bp->b_log_item);
	*bpp = bp;
	return 0;
}

/*
 * Get and lock the superblock buffer for the given transaction.
 */
static struct xfs_buf *
__xfs_trans_getsb(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp)
{
	/*
	 * Just increment the lock recursion count if the buffer is already
	 * attached to this transaction.
	 */
	if (bp->b_transp == tp) {
		struct xfs_buf_log_item	*bip = bp->b_log_item;

		ASSERT(bip != NULL);
		ASSERT(atomic_read(&bip->bli_refcount) > 0);
		bip->bli_recur++;

		trace_xfs_trans_getsb_recur(bip);
	} else {
		xfs_buf_lock(bp);
		xfs_buf_hold(bp);
		_xfs_trans_bjoin(tp, bp, 1);

		trace_xfs_trans_getsb(bp->b_log_item);
	}

	return bp;
}

struct xfs_buf *
xfs_trans_getsb(
	struct xfs_trans	*tp)
{
	return __xfs_trans_getsb(tp, tp->t_mountp->m_sb_bp);
}

struct xfs_buf *
xfs_trans_getrtsb(
	struct xfs_trans	*tp)
{
	if (!tp->t_mountp->m_rtsb_bp)
		return NULL;
	return __xfs_trans_getsb(tp, tp->t_mountp->m_rtsb_bp);
}

/*
 * Get and lock the buffer for the caller if it is not already
 * locked within the given transaction.  If it has not yet been
 * read in, read it from disk. If it is already locked
 * within the transaction and already read in, just increment its
 * lock recursion count and return a pointer to it.
 *
 * If the transaction pointer is NULL, make this just a normal
 * read_buf() call.
 */
int
xfs_trans_read_buf_map(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp = NULL;
	struct xfs_buf_log_item	*bip;
	int			error;

	*bpp = NULL;
	/*
	 * If we find the buffer in the cache with this transaction
	 * pointer in its b_fsprivate2 field, then we know we already
	 * have it locked.  If it is already read in we just increment
	 * the lock recursion count and return the buffer to the caller.
	 * If the buffer is not yet read in, then we read it in, increment
	 * the lock recursion count, and return it to the caller.
	 */
	if (tp)
		bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
	if (bp) {
		ASSERT(xfs_buf_islocked(bp));
		ASSERT(bp->b_transp == tp);
		ASSERT(bp->b_log_item != NULL);
		ASSERT(!bp->b_error);
		ASSERT(bp->b_flags & XBF_DONE);

		/*
		 * We never locked this buf ourselves, so we shouldn't
		 * brelse it either. Just get out.
		 */
		if (xfs_is_shutdown(mp)) {
			trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
			return -EIO;
		}

		/*
		 * Check if the caller is trying to read a buffer that is
		 * already attached to the transaction yet has no buffer ops
		 * assigned.  Ops are usually attached when the buffer is
		 * attached to the transaction, or by the read caller in
		 * special circumstances.  That didn't happen, which is not
		 * how this is supposed to go.
		 *
		 * If the buffer passes verification we'll let this go, but if
		 * not we have to shut down.  Let the transaction cleanup code
		 * release this buffer when it kills the transaction.
*/ ASSERT(bp->b_ops != NULL); error = xfs_buf_reverify(bp, ops); if (error) { xfs_buf_ioerror_alert(bp, __return_address); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; return error; } bip = bp->b_log_item; bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_read_buf_recur(bip); ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } error = xfs_buf_read_map(target, map, nmaps, flags, &bp, ops, __return_address); switch (error) { case 0: break; default: if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); fallthrough; case -ENOMEM: case -EAGAIN: return error; } if (xfs_is_shutdown(mp)) { xfs_buf_relse(bp); trace_xfs_trans_read_buf_shut(bp, _RET_IP_); return -EIO; } if (tp) { _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_read_buf(bp->b_log_item); } ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } /* Has this buffer been dirtied by anyone? */ bool xfs_trans_buf_is_dirty( struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; if (!bip) return false; ASSERT(bip->bli_item.li_type == XFS_LI_BUF); return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); } /* * Release a buffer previously joined to the transaction. If the buffer is * modified within this transaction, decrement the recursion count but do not * release the buffer even if the count goes to 0. If the buffer is not modified * within the transaction, decrement the recursion count and release the buffer * if the recursion count goes to 0. * * If the buffer is to be released and it was not already dirty before this * transaction began, then also free the buf_log_item associated with it. * * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call. */ void xfs_trans_brelse( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); if (!tp) { xfs_buf_relse(bp); return; } trace_xfs_trans_brelse(bip); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(atomic_read(&bip->bli_refcount) > 0); /* * If the release is for a recursive lookup, then decrement the count * and return. */ if (bip->bli_recur > 0) { bip->bli_recur--; return; } /* * If the buffer is invalidated or dirty in this transaction, we can't * release it until we commit. */ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; if (bip->bli_flags & XFS_BLI_STALE) return; /* * Unlink the log item from the transaction and clear the hold flag, if * set. We wouldn't want the next user of the buffer to get confused. */ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); xfs_trans_del_item(&bip->bli_item); bip->bli_flags &= ~XFS_BLI_HOLD; /* drop the reference to the bli */ xfs_buf_item_put(bip); bp->b_transp = NULL; xfs_buf_relse(bp); } /* * Forcibly detach a buffer previously joined to the transaction. The caller * will retain its locked reference to the buffer after this function returns. * The buffer must be completely clean and must not be held to the transaction. */ void xfs_trans_bdetach( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(tp != NULL); ASSERT(bp->b_transp == tp); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_bdetach(bip); /* * Erase all recursion count, since we're removing this buffer from the * transaction. 
*/ bip->bli_recur = 0; /* * The buffer must be completely clean. Specifically, it had better * not be dirty, stale, logged, ordered, or held to the transaction. */ ASSERT(!test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY)); ASSERT(!(bip->bli_flags & XFS_BLI_HOLD)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); /* Unlink the log item from the transaction and drop the log item. */ xfs_trans_del_item(&bip->bli_item); xfs_buf_item_put(bip); bp->b_transp = NULL; } /* * Mark the buffer as not needing to be unlocked when the buf item's * iop_committing() routine is called. The buffer must already be locked * and associated with the given transaction. */ /* ARGSUSED */ void xfs_trans_bhold( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; trace_xfs_trans_bhold(bip); } /* * Cancel the previous buffer hold request made on this buffer * for this transaction. */ void xfs_trans_bhold_release( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); bip->bli_flags &= ~XFS_BLI_HOLD; trace_xfs_trans_bhold_release(bip); } /* * Mark a buffer dirty in the transaction. */ void xfs_trans_dirty_buf( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); /* * Mark the buffer as needing to be written out eventually, * and set its iodone function to remove the buffer's buf log * item from the AIL and free it when the buffer is flushed * to disk. */ bp->b_flags |= XBF_DONE; ASSERT(atomic_read(&bip->bli_refcount) > 0); /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer * again. There are no races with the code in xfs_buf_item_unpin(), * because we have a reference to the buffer this entire time. */ if (bip->bli_flags & XFS_BLI_STALE) { bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(bp->b_flags & XBF_STALE); bp->b_flags &= ~XBF_STALE; bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); } /* * This is called to mark bytes first through last inclusive of the given * buffer as needing to be logged when the transaction is committed. * The buffer must already be associated with the given transaction. * * First and last are numbers relative to the beginning of this buffer, * so the first byte in the buffer is numbered 0 regardless of the * value of b_blkno. */ void xfs_trans_log_buf( struct xfs_trans *tp, struct xfs_buf *bp, uint first, uint last) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); xfs_trans_dirty_buf(tp, bp); trace_xfs_trans_log_buf(bip); xfs_buf_item_log(bip, first, last); } /* * Invalidate a buffer that is being used within a transaction. 
* * Typically this is because the blocks in the buffer are being freed, so we * need to prevent it from being written out when we're done. Allowing it * to be written again might overwrite data in the free blocks if they are * reallocated to a file. * * We prevent the buffer from being written out by marking it stale. We can't * get rid of the buf log item at this point because the buffer may still be * pinned by another transaction. If that is the case, then we'll wait until * the buffer is committed to disk for the last time (we can tell by the ref * count) and free it in xfs_buf_item_unpin(). Until that happens we will * keep the buffer locked so that the buffer and buf log item are not reused. * * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log * the buf item. This will be used at recovery time to determine that copies * of the buffer in the log before this should not be replayed. * * We mark the item descriptor and the transaction dirty so that we'll hold * the buffer until after the commit. * * Since we're invalidating the buffer, we also clear the state about which * parts of the buffer have been logged. We also clear the flag indicating * that this is an inode buffer since the data in the buffer will no longer * be valid. * * We set the stale bit in the buffer as well since we're getting rid of it. */ void xfs_trans_binval( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; int i; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_binval(bip); if (bip->bli_flags & XFS_BLI_STALE) { /* * If the buffer is already invalidated, then * just return. */ ASSERT(bp->b_flags & XBF_STALE); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; } xfs_buf_stale(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK; for (i = 0; i < bip->bli_format_count; i++) { memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); } set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); tp->t_flags |= XFS_TRANS_DIRTY; } /* * This call is used to indicate that the buffer contains on-disk inodes which * must be handled specially during recovery. They require special handling * because only the di_next_unlinked from the inodes in the buffer should be * recovered. The rest of the data in the buffer is logged via the inodes * themselves. * * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be * transferred to the buffer's log format structure so that we'll know what to * do at recovery time. */ void xfs_trans_inode_buf( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * This call is used to indicate that the buffer is going to * be staled and was an inode buffer. 
This means it gets * special processing during unpin - where any inodes * associated with the buffer should be removed from ail. * There is also special processing during recovery, * any replay of the inodes in the buffer needs to be * prevented as the buffer may have been reused. */ void xfs_trans_stale_inode_buf( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * Mark the buffer as being one which contains newly allocated * inodes. We need to make sure that even if this buffer is * relogged as an 'inode buf' we still recover all of the inode * images in the face of a crash. This works in coordination with * xfs_buf_item_committed() to ensure that the buffer remains in the * AIL at its original location even after it has been relogged. */ /* ARGSUSED */ void xfs_trans_inode_alloc_buf( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; bp->b_iodone = xfs_buf_inode_iodone; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * Mark the buffer as ordered for this transaction. This means that the contents * of the buffer are not recorded in the transaction but it is tracked in the * AIL as though it was. This allows us to record logical changes in * transactions rather than the physical changes we make to the buffer without * changing writeback ordering constraints of metadata buffers. */ bool xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); if (xfs_buf_item_dirty_format(bip)) return false; bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); /* * We don't log a dirty range of an ordered buffer but it still needs * to be marked dirty and that it has been logged. */ xfs_trans_dirty_buf(tp, bp); return true; } /* * Set the type of the buffer for log recovery so that it can correctly identify * and hence attach the correct buffer ops to the buffer after replay. */ void xfs_trans_buf_set_type( struct xfs_trans *tp, struct xfs_buf *bp, enum xfs_blft type) { struct xfs_buf_log_item *bip = bp->b_log_item; if (!tp) return; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); xfs_blft_to_flags(&bip->__bli_format, type); } void xfs_trans_buf_copy_type( struct xfs_buf *dst_bp, struct xfs_buf *src_bp) { struct xfs_buf_log_item *sbip = src_bp->b_log_item; struct xfs_buf_log_item *dbip = dst_bp->b_log_item; enum xfs_blft type; type = xfs_blft_from_flags(&sbip->__bli_format); xfs_blft_to_flags(&dbip->__bli_format, type); } /* * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of * dquots. However, unlike in inode buffer recovery, dquot buffers get * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag). * The only thing that makes dquot buffers different from regular * buffers is that we must not replay dquot bufs when recovering * if a _corresponding_ quotaoff has happened. We also have to distinguish * between usr dquot bufs and grp dquot bufs, because usr and grp quotas * can be turned off independently. 
*/ /* ARGSUSED */ void xfs_trans_dquot_buf( xfs_trans_t *tp, struct xfs_buf *bp, uint type) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || type == XFS_BLF_GDQUOT_BUF); bip->__bli_format.blf_flags |= type; switch (type) { case XFS_BLF_UDQUOT_BUF: type = XFS_BLFT_UDQUOT_BUF; break; case XFS_BLF_PDQUOT_BUF: type = XFS_BLFT_PDQUOT_BUF; break; case XFS_BLF_GDQUOT_BUF: type = XFS_BLFT_GDQUOT_BUF; break; default: type = XFS_BLFT_UNKNOWN_BUF; break; } bp->b_iodone = xfs_buf_dquot_iodone; xfs_trans_buf_set_type(tp, bp, type); }
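/*
 * Illustrative sketch, not part of the original file: the typical
 * read-modify-log cycle the helpers above support. Error handling is
 * trimmed, and the map/ops arguments stand in for whatever metadata the
 * caller is updating.
 */
static int
example_dirty_one_buf(
	struct xfs_trans	*tp,
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	int			error;

	error = xfs_trans_read_buf_map(tp->t_mountp, tp, target, map, nmaps,
			0, &bp, ops);
	if (error)
		return error;

	/* ... modify the first 128 bytes of the buffer here ... */
	xfs_trans_log_buf(tp, bp, 0, 127);

	/* Committing (or cancelling) tp unlocks and releases the buffer. */
	return 0;
}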
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/lib/kasprintf.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/stdarg.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>

/* Simplified asprintf. */
char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
{
	unsigned int first, second;
	char *p;
	va_list aq;

	va_copy(aq, ap);
	first = vsnprintf(NULL, 0, fmt, aq);
	va_end(aq);

	p = kmalloc_track_caller(first+1, gfp);
	if (!p)
		return NULL;

	second = vsnprintf(p, first+1, fmt, ap);
	WARN(first != second, "different return values (%u and %u) from vsnprintf(\"%s\", ...)",
	     first, second, fmt);

	return p;
}
EXPORT_SYMBOL(kvasprintf);

/*
 * If fmt contains no % (or is exactly %s), use kstrdup_const. If fmt
 * (or the sole vararg) points to rodata, we will then save a memory
 * allocation and string copy. In any case, the return value should be
 * freed using kfree_const().
 */
const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap)
{
	if (!strchr(fmt, '%'))
		return kstrdup_const(fmt, gfp);
	if (!strcmp(fmt, "%s"))
		return kstrdup_const(va_arg(ap, const char*), gfp);
	return kvasprintf(gfp, fmt, ap);
}
EXPORT_SYMBOL(kvasprintf_const);

char *kasprintf(gfp_t gfp, const char *fmt, ...)
{
	va_list ap;
	char *p;

	va_start(ap, fmt);
	p = kvasprintf(gfp, fmt, ap);
	va_end(ap);

	return p;
}
EXPORT_SYMBOL(kasprintf);
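/*
 * Usage sketch, not part of the original file: kasprintf() returns a
 * freshly allocated, formatted string to be released with kfree();
 * strings from kvasprintf_const() must instead go through kfree_const().
 */
static void kasprintf_example(void)
{
	char *name = kasprintf(GFP_KERNEL, "queue-%d", 3);

	if (name) {
		pr_info("allocated name: %s\n", name);
		kfree(name);
	}
}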
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2013 Politecnico di Torino, Italy
 *                    TORSEC group -- https://security.polito.it
 *
 * Author: Roberto Sassu <roberto.sassu@polito.it>
 *
 * File: ima_template.c
 *      Helpers to manage template descriptors.
*/ #include <linux/rculist.h> #include "ima.h" #include "ima_template_lib.h" enum header_fields { HDR_PCR, HDR_DIGEST, HDR_TEMPLATE_NAME, HDR_TEMPLATE_DATA, HDR__LAST }; static struct ima_template_desc builtin_templates[] = { {.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT}, {.name = "ima-ng", .fmt = "d-ng|n-ng"}, {.name = "ima-sig", .fmt = "d-ng|n-ng|sig"}, {.name = "ima-ngv2", .fmt = "d-ngv2|n-ng"}, {.name = "ima-sigv2", .fmt = "d-ngv2|n-ng|sig"}, {.name = "ima-buf", .fmt = "d-ng|n-ng|buf"}, {.name = "ima-modsig", .fmt = "d-ng|n-ng|sig|d-modsig|modsig"}, {.name = "evm-sig", .fmt = "d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode"}, {.name = "", .fmt = ""}, /* placeholder for a custom format */ }; static LIST_HEAD(defined_templates); static DEFINE_SPINLOCK(template_list); static int template_setup_done; static const struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, .field_show = ima_show_template_digest}, {.field_id = "n", .field_init = ima_eventname_init, .field_show = ima_show_template_string}, {.field_id = "d-ng", .field_init = ima_eventdigest_ng_init, .field_show = ima_show_template_digest_ng}, {.field_id = "d-ngv2", .field_init = ima_eventdigest_ngv2_init, .field_show = ima_show_template_digest_ngv2}, {.field_id = "n-ng", .field_init = ima_eventname_ng_init, .field_show = ima_show_template_string}, {.field_id = "sig", .field_init = ima_eventsig_init, .field_show = ima_show_template_sig}, {.field_id = "buf", .field_init = ima_eventbuf_init, .field_show = ima_show_template_buf}, {.field_id = "d-modsig", .field_init = ima_eventdigest_modsig_init, .field_show = ima_show_template_digest_ng}, {.field_id = "modsig", .field_init = ima_eventmodsig_init, .field_show = ima_show_template_sig}, {.field_id = "evmsig", .field_init = ima_eventevmsig_init, .field_show = ima_show_template_sig}, {.field_id = "iuid", .field_init = ima_eventinodeuid_init, .field_show = ima_show_template_uint}, {.field_id = "igid", .field_init = ima_eventinodegid_init, .field_show = ima_show_template_uint}, {.field_id = "imode", .field_init = ima_eventinodemode_init, .field_show = ima_show_template_uint}, {.field_id = "xattrnames", .field_init = ima_eventinodexattrnames_init, .field_show = ima_show_template_string}, {.field_id = "xattrlengths", .field_init = ima_eventinodexattrlengths_init, .field_show = ima_show_template_sig}, {.field_id = "xattrvalues", .field_init = ima_eventinodexattrvalues_init, .field_show = ima_show_template_sig}, }; /* * Used when restoring measurements carried over from a kexec. 'd' and 'n' don't * need to be accounted for since they shouldn't be defined in the same template * description as 'd-ng' and 'n-ng' respectively. */ #define MAX_TEMPLATE_NAME_LEN \ sizeof("d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode") static struct ima_template_desc *ima_template; static struct ima_template_desc *ima_buf_template; /** * ima_template_has_modsig - Check whether template has modsig-related fields. * @ima_template: IMA template to check. * * Tells whether the given template has fields referencing a file's appended * signature. 
*/ bool ima_template_has_modsig(const struct ima_template_desc *ima_template) { int i; for (i = 0; i < ima_template->num_fields; i++) if (!strcmp(ima_template->fields[i]->field_id, "modsig") || !strcmp(ima_template->fields[i]->field_id, "d-modsig")) return true; return false; } static int __init ima_template_setup(char *str) { struct ima_template_desc *template_desc; int template_len = strlen(str); if (template_setup_done) return 1; if (!ima_template) ima_init_template_list(); /* * Verify that a template with the supplied name exists. * If not, use CONFIG_IMA_DEFAULT_TEMPLATE. */ template_desc = lookup_template_desc(str); if (!template_desc) { pr_err("template %s not found, using %s\n", str, CONFIG_IMA_DEFAULT_TEMPLATE); return 1; } /* * Verify whether the current hash algorithm is supported * by the 'ima' template. */ if (template_len == 3 && strcmp(str, IMA_TEMPLATE_IMA_NAME) == 0 && ima_hash_algo != HASH_ALGO_SHA1 && ima_hash_algo != HASH_ALGO_MD5) { pr_err("template does not support hash alg\n"); return 1; } ima_template = template_desc; template_setup_done = 1; return 1; } __setup("ima_template=", ima_template_setup); static int __init ima_template_fmt_setup(char *str) { int num_templates = ARRAY_SIZE(builtin_templates); if (template_setup_done) return 1; if (template_desc_init_fields(str, NULL, NULL) < 0) { pr_err("format string '%s' not valid, using template %s\n", str, CONFIG_IMA_DEFAULT_TEMPLATE); return 1; } builtin_templates[num_templates - 1].fmt = str; ima_template = builtin_templates + num_templates - 1; template_setup_done = 1; return 1; } __setup("ima_template_fmt=", ima_template_fmt_setup); struct ima_template_desc *lookup_template_desc(const char *name) { struct ima_template_desc *template_desc; int found = 0; rcu_read_lock(); list_for_each_entry_rcu(template_desc, &defined_templates, list) { if ((strcmp(template_desc->name, name) == 0) || (strcmp(template_desc->fmt, name) == 0)) { found = 1; break; } } rcu_read_unlock(); return found ? template_desc : NULL; } static const struct ima_template_field * lookup_template_field(const char *field_id) { int i; for (i = 0; i < ARRAY_SIZE(supported_fields); i++) if (strncmp(supported_fields[i].field_id, field_id, IMA_TEMPLATE_FIELD_ID_MAX_LEN) == 0) return &supported_fields[i]; return NULL; } static int template_fmt_size(const char *template_fmt) { char c; int template_fmt_len = strlen(template_fmt); int i = 0, j = 0; while (i < template_fmt_len) { c = template_fmt[i]; if (c == '|') j++; i++; } return j + 1; } int template_desc_init_fields(const char *template_fmt, const struct ima_template_field ***fields, int *num_fields) { const char *template_fmt_ptr; const struct ima_template_field *found_fields[IMA_TEMPLATE_NUM_FIELDS_MAX]; int template_num_fields; int i, len; if (num_fields && *num_fields > 0) /* already initialized? 
*/ return 0; template_num_fields = template_fmt_size(template_fmt); if (template_num_fields > IMA_TEMPLATE_NUM_FIELDS_MAX) { pr_err("format string '%s' contains too many fields\n", template_fmt); return -EINVAL; } for (i = 0, template_fmt_ptr = template_fmt; i < template_num_fields; i++, template_fmt_ptr += len + 1) { char tmp_field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN + 1]; len = strchrnul(template_fmt_ptr, '|') - template_fmt_ptr; if (len == 0 || len > IMA_TEMPLATE_FIELD_ID_MAX_LEN) { pr_err("Invalid field with length %d\n", len); return -EINVAL; } memcpy(tmp_field_id, template_fmt_ptr, len); tmp_field_id[len] = '\0'; found_fields[i] = lookup_template_field(tmp_field_id); if (!found_fields[i]) { pr_err("field '%s' not found\n", tmp_field_id); return -ENOENT; } } if (fields && num_fields) { *fields = kmalloc_array(i, sizeof(**fields), GFP_KERNEL); if (*fields == NULL) return -ENOMEM; memcpy(*fields, found_fields, i * sizeof(**fields)); *num_fields = i; } return 0; } void ima_init_template_list(void) { int i; if (!list_empty(&defined_templates)) return; spin_lock(&template_list); for (i = 0; i < ARRAY_SIZE(builtin_templates); i++) { list_add_tail_rcu(&builtin_templates[i].list, &defined_templates); } spin_unlock(&template_list); } struct ima_template_desc *ima_template_desc_current(void) { if (!ima_template) { ima_init_template_list(); ima_template = lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE); } return ima_template; } struct ima_template_desc *ima_template_desc_buf(void) { if (!ima_buf_template) { ima_init_template_list(); ima_buf_template = lookup_template_desc("ima-buf"); } return ima_buf_template; } int __init ima_init_template(void) { struct ima_template_desc *template = ima_template_desc_current(); int result; result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); if (result < 0) { pr_err("template %s init failed, result: %d\n", (strlen(template->name) ? template->name : template->fmt), result); return result; } template = ima_template_desc_buf(); if (!template) { pr_err("Failed to get ima-buf template\n"); return -EINVAL; } result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); if (result < 0) pr_err("template %s init failed, result: %d\n", (strlen(template->name) ? 
template->name : template->fmt), result); return result; } static struct ima_template_desc *restore_template_fmt(char *template_name) { struct ima_template_desc *template_desc = NULL; int ret; ret = template_desc_init_fields(template_name, NULL, NULL); if (ret < 0) { pr_err("attempting to initialize the template \"%s\" failed\n", template_name); goto out; } template_desc = kzalloc(sizeof(*template_desc), GFP_KERNEL); if (!template_desc) goto out; template_desc->name = ""; template_desc->fmt = kstrdup(template_name, GFP_KERNEL); if (!template_desc->fmt) { kfree(template_desc); template_desc = NULL; goto out; } spin_lock(&template_list); list_add_tail_rcu(&template_desc->list, &defined_templates); spin_unlock(&template_list); out: return template_desc; } static int ima_restore_template_data(struct ima_template_desc *template_desc, void *template_data, int template_data_size, struct ima_template_entry **entry) { struct tpm_digest *digests; int ret = 0; int i; *entry = kzalloc(struct_size(*entry, template_data, template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, sizeof(*digests), GFP_NOFS); if (!digests) { kfree(*entry); return -ENOMEM; } (*entry)->digests = digests; ret = ima_parse_buf(template_data, template_data + template_data_size, NULL, template_desc->num_fields, (*entry)->template_data, NULL, NULL, ENFORCE_FIELDS | ENFORCE_BUFEND, "template data"); if (ret < 0) { kfree((*entry)->digests); kfree(*entry); return ret; } (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { struct ima_field_data *field_data = &(*entry)->template_data[i]; u8 *data = field_data->data; (*entry)->template_data[i].data = kzalloc(field_data->len + 1, GFP_KERNEL); if (!(*entry)->template_data[i].data) { ret = -ENOMEM; break; } memcpy((*entry)->template_data[i].data, data, field_data->len); (*entry)->template_data_len += sizeof(field_data->len); (*entry)->template_data_len += field_data->len; } if (ret < 0) { ima_free_template_entry(*entry); *entry = NULL; } return ret; } /* Restore the serialized binary measurement list without extending PCRs. 
*/ int ima_restore_measurement_list(loff_t size, void *buf) { char template_name[MAX_TEMPLATE_NAME_LEN]; unsigned char zero[TPM_DIGEST_SIZE] = { 0 }; struct ima_kexec_hdr *khdr = buf; struct ima_field_data hdr[HDR__LAST] = { [HDR_PCR] = {.len = sizeof(u32)}, [HDR_DIGEST] = {.len = TPM_DIGEST_SIZE}, }; void *bufp = buf + sizeof(*khdr); void *bufendp; struct ima_template_entry *entry; struct ima_template_desc *template_desc; DECLARE_BITMAP(hdr_mask, HDR__LAST); unsigned long count = 0; int ret = 0; if (!buf || size < sizeof(*khdr)) return 0; if (ima_canonical_fmt) { khdr->version = le16_to_cpu((__force __le16)khdr->version); khdr->count = le64_to_cpu((__force __le64)khdr->count); khdr->buffer_size = le64_to_cpu((__force __le64)khdr->buffer_size); } if (khdr->version != 1) { pr_err("attempting to restore a incompatible measurement list"); return -EINVAL; } if (khdr->count > ULONG_MAX - 1) { pr_err("attempting to restore too many measurements"); return -EINVAL; } bitmap_zero(hdr_mask, HDR__LAST); bitmap_set(hdr_mask, HDR_PCR, 1); bitmap_set(hdr_mask, HDR_DIGEST, 1); /* * ima kexec buffer prefix: version, buffer size, count * v1 format: pcr, digest, template-name-len, template-name, * template-data-size, template-data */ bufendp = buf + khdr->buffer_size; while ((bufp < bufendp) && (count++ < khdr->count)) { int enforce_mask = ENFORCE_FIELDS; enforce_mask |= (count == khdr->count) ? ENFORCE_BUFEND : 0; ret = ima_parse_buf(bufp, bufendp, &bufp, HDR__LAST, hdr, NULL, hdr_mask, enforce_mask, "entry header"); if (ret < 0) break; if (hdr[HDR_TEMPLATE_NAME].len >= MAX_TEMPLATE_NAME_LEN) { pr_err("attempting to restore a template name that is too long\n"); ret = -EINVAL; break; } /* template name is not null terminated */ memcpy(template_name, hdr[HDR_TEMPLATE_NAME].data, hdr[HDR_TEMPLATE_NAME].len); template_name[hdr[HDR_TEMPLATE_NAME].len] = 0; if (strcmp(template_name, "ima") == 0) { pr_err("attempting to restore an unsupported template \"%s\" failed\n", template_name); ret = -EINVAL; break; } template_desc = lookup_template_desc(template_name); if (!template_desc) { template_desc = restore_template_fmt(template_name); if (!template_desc) break; } /* * Only the running system's template format is initialized * on boot. As needed, initialize the other template formats. */ ret = template_desc_init_fields(template_desc->fmt, &(template_desc->fields), &(template_desc->num_fields)); if (ret < 0) { pr_err("attempting to restore the template fmt \"%s\" failed\n", template_desc->fmt); ret = -EINVAL; break; } ret = ima_restore_template_data(template_desc, hdr[HDR_TEMPLATE_DATA].data, hdr[HDR_TEMPLATE_DATA].len, &entry); if (ret < 0) break; if (memcmp(hdr[HDR_DIGEST].data, zero, sizeof(zero))) { ret = ima_calc_field_array_hash( &entry->template_data[0], entry); if (ret < 0) { pr_err("cannot calculate template digest\n"); ret = -EINVAL; break; } } entry->pcr = !ima_canonical_fmt ? *(u32 *)(hdr[HDR_PCR].data) : le32_to_cpu(*(__le32 *)(hdr[HDR_PCR].data)); ret = ima_restore_measurement_entry(entry); if (ret < 0) break; } return ret; }
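/*
 * Illustrative sketch (editor's addition, not part of ima_template.c):
 * template_fmt_size() counts fields by counting '|' separators, and
 * template_desc_init_fields() walks the format string with strchrnul()
 * to pull out each field id.  The stand-alone program below reproduces
 * that tokenization for a format such as "d-ng|n-ng|sig"; strchrnul()
 * is a GNU extension, hence _GNU_SOURCE.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

#define FIELD_ID_MAX 16	/* stand-in for IMA_TEMPLATE_FIELD_ID_MAX_LEN */

/* Walk a '|'-separated format the way template_desc_init_fields() does. */
static int parse_template_fmt(const char *fmt)
{
	const char *p = fmt;

	while (*p) {
		char id[FIELD_ID_MAX + 1];
		size_t len = strchrnul(p, '|') - p;

		if (len == 0 || len > FIELD_ID_MAX)
			return -1;	/* empty or oversized field id */
		memcpy(id, p, len);
		id[len] = '\0';
		printf("field: %s\n", id);

		p += len;
		if (*p == '|')
			p++;
	}
	return 0;
}

int main(void)
{
	return parse_template_fmt("d-ng|n-ng|sig");	/* the ima-sig format */
}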
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KERNEL_PRINTK__
#define __KERNEL_PRINTK__

#include <linux/stdarg.h>
#include <linux/init.h>
#include <linux/kern_levels.h>
#include <linux/linkage.h>
#include <linux/ratelimit_types.h>
#include <linux/once_lite.h>

struct console;

extern const char linux_banner[];
extern const char linux_proc_banner[];

extern int oops_in_progress;	/* If set, an oops, panic(), BUG()
or die() is in progress */ #define PRINTK_MAX_SINGLE_HEADER_LEN 2 static inline int printk_get_level(const char *buffer) { if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { switch (buffer[1]) { case '0' ... '7': case 'c': /* KERN_CONT */ return buffer[1]; } } return 0; } static inline const char *printk_skip_level(const char *buffer) { if (printk_get_level(buffer)) return buffer + 2; return buffer; } static inline const char *printk_skip_headers(const char *buffer) { while (printk_get_level(buffer)) buffer = printk_skip_level(buffer); return buffer; } /* printk's without a loglevel use this.. */ #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT /* We show everything that is MORE important than this.. */ #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ #define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ #define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ #define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ /* * Default used to be hard-coded at 7, quiet used to be hardcoded at 4, * we're now allowing both to be set from kernel config. */ #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT #define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET int match_devname_and_update_preferred_console(const char *match, const char *name, const short idx); extern int console_printk[]; #define console_loglevel (console_printk[0]) #define default_message_loglevel (console_printk[1]) #define minimum_console_loglevel (console_printk[2]) #define default_console_loglevel (console_printk[3]) extern void console_verbose(void); /* strlen("ratelimit") + 1 */ #define DEVKMSG_STR_MAX_SIZE 10 extern char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE]; struct ctl_table; extern int suppress_printk; struct va_format { const char *fmt; va_list *va; }; /* * FW_BUG * Add this to a message where you are sure the firmware is buggy or behaves * really stupid or out of spec. Be aware that the responsible BIOS developer * should be able to fix this issue or at least get a concrete idea of the * problem by reading your message without the need of looking at the kernel * code. * * Use it for definite and high priority BIOS bugs. * * FW_WARN * Use it for not that clear (e.g. could the kernel messed up things already?) * and medium priority BIOS bugs. * * FW_INFO * Use this one if you want to tell the user or vendor about something * suspicious, but generally harmless related to the firmware. * * Use it for information or very low priority BIOS bugs. */ #define FW_BUG "[Firmware Bug]: " #define FW_WARN "[Firmware Warn]: " #define FW_INFO "[Firmware Info]: " /* * HW_ERR * Add this to a message for hardware errors, so that user can report * it to hardware vendor instead of LKML or software vendor. */ #define HW_ERR "[Hardware Error]: " /* * DEPRECATED * Add this to a message whenever you want to warn user space about the use * of a deprecated aspect of an API so they can stop using it */ #define DEPRECATED "[Deprecated]: " /* * Dummy printk for disabled debugging statements to use whilst maintaining * gcc's format checking. */ #define no_printk(fmt, ...) \ ({ \ if (0) \ _printk(fmt, ##__VA_ARGS__); \ 0; \ }) #ifdef CONFIG_EARLY_PRINTK extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); #else static inline __printf(1, 2) __cold void early_printk(const char *s, ...) 
{ } #endif struct dev_printk_info; #ifdef CONFIG_PRINTK asmlinkage __printf(4, 0) int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args); asmlinkage __printf(1, 0) int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int _printk(const char *fmt, ...); /* * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! */ __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); extern void __printk_deferred_enter(void); extern void __printk_deferred_exit(void); extern void printk_force_console_enter(void); extern void printk_force_console_exit(void); /* * The printk_deferred_enter/exit macros are available only as a hack for * some code paths that need to defer all printk console printing. Interrupts * must be disabled for the deferred duration. */ #define printk_deferred_enter() __printk_deferred_enter() #define printk_deferred_exit() __printk_deferred_exit() /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use * printk_ratelimited() or plain old __ratelimit(). */ extern int __printk_ratelimit(const char *func); #define printk_ratelimit() __printk_ratelimit(__func__) extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); extern int printk_delay_msec; extern int dmesg_restrict; extern void wake_up_klogd(void); char *log_buf_addr_get(void); u32 log_buf_len_get(void); void log_buf_vmcoreinfo_setup(void); void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; void printk_trigger_flush(void); void console_try_replay_all(void); void printk_legacy_allow_panic_sync(void); extern bool nbcon_device_try_acquire(struct console *con); extern void nbcon_device_release(struct console *con); void nbcon_atomic_flush_unsafe(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) { return 0; } static inline __printf(1, 2) __cold int _printk(const char *s, ...) { return 0; } static inline __printf(1, 2) __cold int _printk_deferred(const char *s, ...) { return 0; } static inline void printk_deferred_enter(void) { } static inline void printk_deferred_exit(void) { } static inline void printk_force_console_enter(void) { } static inline void printk_force_console_exit(void) { } static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec) { return false; } static inline void wake_up_klogd(void) { } static inline char *log_buf_addr_get(void) { return NULL; } static inline u32 log_buf_len_get(void) { return 0; } static inline void log_buf_vmcoreinfo_setup(void) { } static inline void setup_log_buf(int early) { } static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...) 
{ } static inline void dump_stack_print_info(const char *log_lvl) { } static inline void show_regs_print_info(const char *log_lvl) { } static inline void dump_stack_lvl(const char *log_lvl) { } static inline void dump_stack(void) { } static inline void printk_trigger_flush(void) { } static inline void console_try_replay_all(void) { } static inline void printk_legacy_allow_panic_sync(void) { } static inline bool nbcon_device_try_acquire(struct console *con) { return false; } static inline void nbcon_device_release(struct console *con) { } static inline void nbcon_atomic_flush_unsafe(void) { } #endif bool this_cpu_in_panic(void); #ifdef CONFIG_SMP extern int __printk_cpu_sync_try_get(void); extern void __printk_cpu_sync_wait(void); extern void __printk_cpu_sync_put(void); #else #define __printk_cpu_sync_try_get() true #define __printk_cpu_sync_wait() #define __printk_cpu_sync_put() #endif /* CONFIG_SMP */ /** * printk_cpu_sync_get_irqsave() - Disable interrupts and acquire the printk * cpu-reentrant spinning lock. * @flags: Stack-allocated storage for saving local interrupt state, * to be passed to printk_cpu_sync_put_irqrestore(). * * If the lock is owned by another CPU, spin until it becomes available. * Interrupts are restored while spinning. * * CAUTION: This function must be used carefully. It does not behave like a * typical lock. Here are important things to watch out for... * * * This function is reentrant on the same CPU. Therefore the calling * code must not assume exclusive access to data if code accessing the * data can run reentrant or within NMI context on the same CPU. * * * If there exists usage of this function from NMI context, it becomes * unsafe to perform any type of locking or spinning to wait for other * CPUs after calling this function from any context. This includes * using spinlocks or any other busy-waiting synchronization methods. */ #define printk_cpu_sync_get_irqsave(flags) \ for (;;) { \ local_irq_save(flags); \ if (__printk_cpu_sync_try_get()) \ break; \ local_irq_restore(flags); \ __printk_cpu_sync_wait(); \ } /** * printk_cpu_sync_put_irqrestore() - Release the printk cpu-reentrant spinning * lock and restore interrupts. * @flags: Caller's saved interrupt state, from printk_cpu_sync_get_irqsave(). */ #define printk_cpu_sync_put_irqrestore(flags) \ do { \ __printk_cpu_sync_put(); \ local_irq_restore(flags); \ } while (0) extern int kptr_restrict; /** * pr_fmt - used by the pr_*() macros to generate the printk format string * @fmt: format string passed from a pr_*() macro * * This macro can be used to generate a unified format string for pr_*() * macros. A common use is to prefix all pr_*() messages in a file with a common * string. For example, defining this at the top of a source file: * * #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt * * would prefix all pr_info, pr_emerg... messages in the file with the module * name. */ #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif struct module; #ifdef CONFIG_PRINTK_INDEX struct pi_entry { const char *fmt; const char *func; const char *file; unsigned int line; /* * While printk and pr_* have the level stored in the string at compile * time, some subsystems dynamically add it at runtime through the * format string. For these dynamic cases, we allow the subsystem to * tell us the level at compile time. * * NULL indicates that the level, if any, is stored in fmt. */ const char *level; /* * The format string used by various subsystem specific printk() * wrappers to prefix the message. 
* * Note that the static prefix defined by the pr_fmt() macro is stored * directly in the message format (@fmt), not here. */ const char *subsys_fmt_prefix; } __packed; #define __printk_index_emit(_fmt, _level, _subsys_fmt_prefix) \ do { \ if (__builtin_constant_p(_fmt) && __builtin_constant_p(_level)) { \ /* * We check __builtin_constant_p multiple times here * for the same input because GCC will produce an error * if we try to assign a static variable to fmt if it * is not a constant, even with the outer if statement. */ \ static const struct pi_entry _entry \ __used = { \ .fmt = __builtin_constant_p(_fmt) ? (_fmt) : NULL, \ .func = __func__, \ .file = __FILE__, \ .line = __LINE__, \ .level = __builtin_constant_p(_level) ? (_level) : NULL, \ .subsys_fmt_prefix = _subsys_fmt_prefix,\ }; \ static const struct pi_entry *_entry_ptr \ __used __section(".printk_index") = &_entry; \ } \ } while (0) #else /* !CONFIG_PRINTK_INDEX */ #define __printk_index_emit(...) do {} while (0) #endif /* CONFIG_PRINTK_INDEX */ /* * Some subsystems have their own custom printk that applies a va_format to a * generic format, for example, to include a device number or other metadata * alongside the format supplied by the caller. * * In order to store these in the way they would be emitted by the printk * infrastructure, the subsystem provides us with the start, fixed string, and * any subsequent text in the format string. * * We take a variable argument list as pr_fmt/dev_fmt/etc are sometimes passed * as multiple arguments (eg: `"%s: ", "blah"`), and we must only take the * first one. * * subsys_fmt_prefix must be known at compile time, or compilation will fail * (since this is a mistake). If fmt or level is not known at compile time, no * index entry will be made (since this can legitimately happen). */ #define printk_index_subsys_emit(subsys_fmt_prefix, level, fmt, ...) \ __printk_index_emit(fmt, level, subsys_fmt_prefix) #define printk_index_wrap(_p_func, _fmt, ...) \ ({ \ __printk_index_emit(_fmt, NULL, NULL); \ _p_func(_fmt, ##__VA_ARGS__); \ }) /** * printk - print a kernel message * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. * * If printk indexing is enabled, _printk() is called from printk_index_wrap. * Otherwise, printk is simply #defined to _printk. * * We try to grab the console_lock. If we succeed, it's easy - we log the * output and call the console drivers. If we fail to get the semaphore, we * place the output into the log buffer and return. The current holder of * the console_sem will notice the new output in console_unlock(); and will * send it to the consoles before releasing the lock. * * One effect of this deferred printing is that code which calls printk() and * then changes console_loglevel may break. This is because console_loglevel * is inspected when the actual printing occurs. * * See also: * printf(3) * * See the vsnprintf() documentation for format string extensions over C99. */ #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__) #define printk_deferred(fmt, ...) \ printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__) /** * pr_emerg - Print an emergency-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_EMERG loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_emerg(fmt, ...) 
\ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) /** * pr_alert - Print an alert-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_ALERT loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_alert(fmt, ...) \ printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) /** * pr_crit - Print a critical-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_CRIT loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_crit(fmt, ...) \ printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) /** * pr_err - Print an error-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_ERR loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_err(fmt, ...) \ printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) /** * pr_warn - Print a warning-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_WARNING loglevel. It uses pr_fmt() * to generate the format string. */ #define pr_warn(fmt, ...) \ printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) /** * pr_notice - Print a notice-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_NOTICE loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_notice(fmt, ...) \ printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) /** * pr_info - Print an info-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_INFO loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /** * pr_cont - Continues a previous log message in the same line. * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_CONT loglevel. It should only be * used when continuing a log message with no newline ('\n') enclosed. Otherwise * it defaults back to KERN_DEFAULT loglevel. */ #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) /** * pr_devel - Print a debug-level message conditionally * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_DEBUG loglevel if DEBUG is * defined. Otherwise it does nothing. * * It uses pr_fmt() to generate the format string. */ #ifdef DEBUG #define pr_devel(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #include <linux/dynamic_debug.h> /** * pr_debug - Print a debug-level message conditionally * @fmt: format string * @...: arguments for the format string * * This macro expands to dynamic_pr_debug() if CONFIG_DYNAMIC_DEBUG is * set. Otherwise, if DEBUG is defined, it's equivalent to a printk with * KERN_DEBUG loglevel. If DEBUG is not defined it does nothing. * * It uses pr_fmt() to generate the format string (dynamic_pr_debug() uses * pr_fmt() internally). */ #define pr_debug(fmt, ...) \ dynamic_pr_debug(fmt, ##__VA_ARGS__) #elif defined(DEBUG) #define pr_debug(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug(fmt, ...) 
\ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* * Print a one-time message (analogous to WARN_ONCE() et al): */ #ifdef CONFIG_PRINTK #define printk_once(fmt, ...) \ DO_ONCE_LITE(printk, fmt, ##__VA_ARGS__) #define printk_deferred_once(fmt, ...) \ DO_ONCE_LITE(printk_deferred, fmt, ##__VA_ARGS__) #else #define printk_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #define printk_deferred_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_once(fmt, ...) \ printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert_once(fmt, ...) \ printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit_once(fmt, ...) \ printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) #define pr_err_once(fmt, ...) \ printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn_once(fmt, ...) \ printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) #define pr_notice_once(fmt, ...) \ printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info_once(fmt, ...) \ printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* no pr_cont_once, don't do that... */ #if defined(DEBUG) #define pr_devel_once(fmt, ...) \ printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel_once(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(DEBUG) #define pr_debug_once(fmt, ...) \ printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug_once(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* * ratelimited messages with local ratelimit_state, * no local ratelimit_state used in the !PRINTK case */ #ifdef CONFIG_PRINTK #define printk_ratelimited(fmt, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ \ if (__ratelimit(&_rs)) \ printk(fmt, ##__VA_ARGS__); \ }) #else #define printk_ratelimited(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_ratelimited(fmt, ...) \ printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert_ratelimited(fmt, ...) \ printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit_ratelimited(fmt, ...) \ printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) #define pr_err_ratelimited(fmt, ...) \ printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn_ratelimited(fmt, ...) \ printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) #define pr_notice_ratelimited(fmt, ...) \ printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info_ratelimited(fmt, ...) \ printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* no pr_cont_ratelimited, don't do that... */ #if defined(DEBUG) #define pr_devel_ratelimited(fmt, ...) \ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) 
\ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt)); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && \ __ratelimit(&_rs)) \ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #elif defined(DEBUG) #define pr_debug_ratelimited(fmt, ...) \ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif extern const struct file_operations kmsg_fops; enum { DUMP_PREFIX_NONE, DUMP_PREFIX_ADDRESS, DUMP_PREFIX_OFFSET }; extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, char *linebuf, size_t linebuflen, bool ascii); #ifdef CONFIG_PRINTK extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); #else static inline void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { } static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, const void *buf, size_t len) { } #endif #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) #elif defined(DEBUG) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) #else static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { } #endif /** * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) * @buf: data blob to dump * @len: number of bytes in the @buf * * Calls print_hex_dump(), with log level of KERN_DEBUG, * rowsize of 16, groupsize of 1, and ASCII output included. */ #define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) #endif
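/*
 * Illustrative sketch (editor's addition, not part of printk.h): the
 * pr_fmt() indirection documented above works purely by string-literal
 * concatenation at preprocessing time - pr_info("msg") expands to
 * printk(KERN_INFO pr_fmt("msg")), so a file-local pr_fmt() definition
 * prefixes every message in that file.  A userspace sketch of the
 * mechanism, with printf() standing in for printk() (##__VA_ARGS__ is a
 * GNU extension, as in the kernel macros):
 */
#include <stdio.h>

/* File-local prefix, defined before the pr_*() macros as a driver would. */
#define pr_fmt(fmt) "mymod: " fmt

/* Minimal stand-ins for the kernel's pr_*() macros. */
#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err(fmt, ...)  fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_info("loaded, version %d\n", 3);	/* "mymod: loaded, version 3" */
	pr_err("bad value %d\n", -1);		/* "mymod: bad value -1" */
	return 0;
}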
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM notifier

#if !defined(_TRACE_NOTIFIERS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NOTIFIERS_H

#include <linux/tracepoint.h>

DECLARE_EVENT_CLASS(notifier_info,

	TP_PROTO(void *cb),

	TP_ARGS(cb),

	TP_STRUCT__entry(
		__field(void *, cb)
	),

	TP_fast_assign(
		__entry->cb = cb;
	),

	TP_printk("%ps", __entry->cb)
);

/*
 * notifier_register - called upon notifier callback registration
 *
 * @cb: callback pointer
 *
 */
DEFINE_EVENT(notifier_info, notifier_register,

	TP_PROTO(void *cb),

	TP_ARGS(cb)
);

/*
 * notifier_unregister - called upon notifier callback unregistration
 *
 * @cb: callback pointer
 *
 */
DEFINE_EVENT(notifier_info, notifier_unregister,

	TP_PROTO(void *cb),

	TP_ARGS(cb)
);

/*
 * notifier_run - called upon notifier callback execution
 *
 * @cb: callback pointer
 *
 */
DEFINE_EVENT(notifier_info, notifier_run,

	TP_PROTO(void *cb),

	TP_ARGS(cb)
);

#endif /* _TRACE_NOTIFIERS_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/readpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2015, Google, Inc.
 *
 * This was originally taken from fs/mpage.c
 *
 * The ext4_mpage_readpages() function here is intended to
 * replace mpage_readahead() in the general case, not just for
 * encrypted files.  It has some limitations (see below), where it
 * will fall back to block_read_full_folio(), but these limitations
 * should only be hit when page_size != block_size.
 *
 * This will allow us to attach a callback function to support ext4
 * encryption.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
* */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/kdev_t.h> #include <linux/gfp.h> #include <linux/bio.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/highmem.h> #include <linux/prefetch.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include "ext4.h" #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; static mempool_t *bio_post_read_ctx_pool; /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, STEP_MAX, }; struct bio_post_read_ctx { struct bio *bio; struct work_struct work; unsigned int cur_step; unsigned int enabled_steps; }; static void __read_end_io(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx); static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; if (fscrypt_decrypt_bio(bio)) bio_post_read_processing(ctx); else __read_end_io(bio); } static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; /* * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. * This is safe because verity is the last post-read step. */ BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; fsverity_verify_bio(bio); __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) { /* * We use different work queues for decryption and for verity because * verity may require reading metadata pages that need decryption, and * we shouldn't recurse to the same workqueue. */ switch (++ctx->cur_step) { case STEP_DECRYPT: if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { INIT_WORK(&ctx->work, decrypt_work); fscrypt_enqueue_decrypt_work(&ctx->work); return; } ctx->cur_step++; fallthrough; case STEP_VERITY: if (ctx->enabled_steps & (1 << STEP_VERITY)) { INIT_WORK(&ctx->work, verity_work); fsverity_enqueue_verify_work(&ctx->work); return; } ctx->cur_step++; fallthrough; default: __read_end_io(ctx->bio); } } static bool bio_post_read_required(struct bio *bio) { return bio->bi_private && !bio->bi_status; } /* * I/O completion handler for multipage BIOs. * * The mpage code never puts partial pages into a BIO (except for end-of-file). * If a page does not map to a contiguous run of blocks then it simply falls * back to block_read_full_folio(). * * Why is this? If a page's completion depends on a number of different BIOs * which can complete in any order (or at the same time) then determining the * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. 
*/ static void mpage_end_io(struct bio *bio) { if (bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; ctx->cur_step = STEP_INITIAL; bio_post_read_processing(ctx); return; } __read_end_io(bio); } static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } static void ext4_set_bio_post_read_ctx(struct bio *bio, const struct inode *inode, pgoff_t first_idx) { unsigned int post_read_steps = 0; if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= 1 << STEP_DECRYPT; if (ext4_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { /* Due to the mempool, this never fails. */ struct bio_post_read_ctx *ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } } static inline loff_t ext4_readpage_limit(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); } int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t next_block; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t first_block; unsigned page_block; struct block_device *bdev = inode->i_sb->s_bdev; int length; unsigned relative_block = 0; struct ext4_map_blocks map; unsigned int nr_pages = rac ? readahead_count(rac) : 1; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; for (; nr_pages; nr_pages--) { int fully_mapped = 1; unsigned first_hole = blocks_per_page; if (rac) folio = readahead_folio(rac); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the previous result first. */ if ((map.m_flags & EXT4_MAP_MAPPED) && block_in_file > map.m_lblk && block_in_file < (map.m_lblk + map.m_len)) { unsigned map_offset = block_in_file - map.m_lblk; unsigned last = map.m_len - map_offset; first_block = map.m_pblk + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } if (page_block == blocks_per_page) break; page_block++; block_in_file++; } } /* * Then do more ext4_map_blocks() calls until we are * done with this folio. */ while (page_block < blocks_per_page) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); goto next_page; } } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; page_block++; block_in_file++; continue; } if (first_hole != blocks_per_page) goto confused; /* hole -> non-hole */ /* Contiguous blocks? 
*/ if (!page_block) first_block = map.m_pblk; else if (first_block + page_block != map.m_pblk) goto confused; for (relative_block = 0; ; relative_block++) { if (relative_block == map.m_len) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } else if (page_block == blocks_per_page) break; page_block++; block_in_file++; } } if (first_hole != blocks_per_page) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; folio_end_read(folio, true); continue; } } else if (fully_mapped) { folio_set_mappedtodisk(folio); } /* * This folio will go to BIO. Do we need to send this * BIO off first? */ if (bio && (last_block_in_bio != first_block - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); bio = NULL; } if (bio == NULL) { /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ bio = bio_alloc(bdev, bio_max_segs(nr_pages), REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = first_block << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) bio->bi_opf |= REQ_RAHEAD; } length = first_hole << blkbits; if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { submit_bio(bio); bio = NULL; } else last_block_in_bio = first_block + blocks_per_page - 1; continue; confused: if (bio) { submit_bio(bio); bio = NULL; } if (!folio_test_uptodate(folio)) block_read_full_folio(folio, ext4_get_block); else folio_unlock(folio); next_page: ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); return 0; } int __init ext4_init_post_read_processing(void) { bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, bio_post_read_ctx_cache); if (!bio_post_read_ctx_pool) goto fail_free_cache; return 0; fail_free_cache: kmem_cache_destroy(bio_post_read_ctx_cache); fail: return -ENOMEM; } void ext4_exit_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); }
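/*
 * Illustrative sketch (editor's addition, not part of readpage.c): the
 * folio-to-block arithmetic in ext4_mpage_readpages() - block_in_file =
 * folio->index << (PAGE_SHIFT - blkbits) - is easier to see in isolation.
 * With 4 KiB pages and 1 KiB filesystem blocks, each folio index covers
 * four file blocks.  Stand-alone demo with stand-in constants (PAGE_SHIFT
 * is arch-dependent in the kernel):
 */
#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB pages */

/* First file block covered by a given page/folio index. */
static unsigned long long first_block_of(unsigned long index, unsigned blkbits)
{
	return (unsigned long long)index << (PAGE_SHIFT - blkbits);
}

int main(void)
{
	unsigned blkbits = 10;			/* 1 KiB filesystem blocks */
	unsigned blocks_per_page = (1u << PAGE_SHIFT) >> blkbits;

	printf("blocks per page: %u\n", blocks_per_page);	/* 4 */
	printf("folio index 3 starts at file block %llu\n",
	       first_block_of(3, blkbits));			/* 12 */
	return 0;
}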
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simplified MAC Kernel (smack) security module
 *
 * This file contains the smack hook function implementations.
 *
 * Authors:
 *	Casey Schaufler <casey@schaufler-ca.com>
 *	Jarkko Sakkinen <jarkko.sakkinen@intel.com>
 *
 * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com>
 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
 *	Paul Moore <paul@paul-moore.com>
 * Copyright (C) 2010 Nokia Corporation
 * Copyright (C) 2011 Intel Corporation.
 */

#include <linux/xattr.h>
#include <linux/pagemap.h>
#include <linux/mount.h>
#include <linux/stat.h>
#include <linux/kd.h>
#include <asm/ioctls.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/dccp.h>
#include <linux/icmpv6.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <net/cipso_ipv4.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <linux/audit.h>
#include <linux/magic.h>
#include <linux/dcache.h>
#include <linux/personality.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <uapi/linux/shm.h>
#include <linux/binfmts.h>
#include <linux/parser.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/watch_queue.h>
#include <linux/io_uring/cmd.h>
#include <uapi/linux/lsm.h>
#include "smack.h"

#define TRANS_TRUE	"TRUE"
#define TRANS_TRUE_SIZE	4

#define SMK_CONNECTING	0
#define SMK_RECEIVING	1
#define SMK_SENDING	2

/*
 * Smack uses multiple xattrs.
 * SMACK64 - for access control,
 * SMACK64TRANSMUTE - label initialization,
 * Not saved on files - SMACK64IPIN and SMACK64IPOUT,
 * Must be set explicitly - SMACK64EXEC and SMACK64MMAP
 */
#define SMACK_INODE_INIT_XATTRS 2

#ifdef SMACK_IPV6_PORT_LABELING
static DEFINE_MUTEX(smack_ipv6_lock);
static LIST_HEAD(smk_ipv6_port_list);
#endif
struct kmem_cache *smack_rule_cache;
int smack_enabled __initdata;

#define A(s) {"smack"#s, sizeof("smack"#s) - 1, Opt_##s}
static struct {
	const char *name;
	int len;
	int opt;
} smk_mount_opts[] = {
	{"smackfsdef", sizeof("smackfsdef") - 1, Opt_fsdefault},
	A(fsdefault), A(fsfloor), A(fshat), A(fsroot), A(fstransmute)
};
#undef A

static int match_opt_prefix(char *s, int l, char **arg)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(smk_mount_opts); i++) {
		size_t len = smk_mount_opts[i].len;

		if (len > l || memcmp(s, smk_mount_opts[i].name, len))
			continue;
		if (len == l || s[len] != '=')
			continue;
		*arg = s + len + 1;
		return smk_mount_opts[i].opt;
	}
	return Opt_error;
}
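/*
 * Worked example (editorial): for the option string "smackfsfloor=Floor"
 * (l == 18), match_opt_prefix() finds the 12-byte "smackfsfloor" prefix,
 * sees '=' at s[12], points *arg at "Floor" and returns Opt_fsfloor.
 * A bare "smackfsfloor" (len == l, no '=') or "smackfsfloorX=Floor"
 * (s[len] != '=') falls through and yields Opt_error.
 */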
#ifdef CONFIG_SECURITY_SMACK_BRINGUP
static char *smk_bu_mess[] = {
	"Bringup Error",	/* Unused */
	"Bringup",		/* SMACK_BRINGUP_ALLOW */
	"Unconfined Subject",	/* SMACK_UNCONFINED_SUBJECT */
	"Unconfined Object",	/* SMACK_UNCONFINED_OBJECT */
};

static void smk_bu_mode(int mode, char *s)
{
	smack_str_from_perm(s, mode);
}
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
static int smk_bu_note(char *note, struct smack_known *sskp,
		       struct smack_known *oskp, int mode, int rc)
{
	char acc[SMK_NUM_ACCESS_TYPE + 1];

	if (rc <= 0)
		return rc;
	if (rc > SMACK_UNCONFINED_OBJECT)
		rc = 0;

	smk_bu_mode(mode, acc);
	pr_info("Smack %s: (%s %s %s) %s\n", smk_bu_mess[rc],
		sskp->smk_known, oskp->smk_known, acc, note);
	return 0;
}
#else
#define smk_bu_note(note, sskp, oskp, mode, RC) (RC)
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
static int smk_bu_current(char *note, struct smack_known *oskp,
			  int mode, int rc)
{
	struct task_smack *tsp = smack_cred(current_cred());
	char acc[SMK_NUM_ACCESS_TYPE + 1];

	if (rc <= 0)
		return rc;
	if (rc > SMACK_UNCONFINED_OBJECT)
		rc = 0;

	smk_bu_mode(mode, acc);
	pr_info("Smack %s: (%s %s %s) %s %s\n", smk_bu_mess[rc],
		tsp->smk_task->smk_known, oskp->smk_known,
		acc, current->comm, note);
	return 0;
}
#else
#define smk_bu_current(note, oskp, mode, RC) (RC)
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
static int smk_bu_task(struct task_struct *otp, int mode, int rc)
{
	struct task_smack *tsp = smack_cred(current_cred());
	struct smack_known *smk_task = smk_of_task_struct_obj(otp);
	char acc[SMK_NUM_ACCESS_TYPE + 1];

	if (rc <= 0)
		return rc;
	if (rc > SMACK_UNCONFINED_OBJECT)
		rc = 0;

	smk_bu_mode(mode, acc);
	pr_info("Smack %s: (%s %s %s) %s to %s\n", smk_bu_mess[rc],
		tsp->smk_task->smk_known, smk_task->smk_known, acc,
		current->comm, otp->comm);
	return 0;
}
#else
#define smk_bu_task(otp, mode, RC) (RC)
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
static int smk_bu_inode(struct inode *inode, int mode, int rc)
{
	struct task_smack *tsp = smack_cred(current_cred());
	struct inode_smack *isp = smack_inode(inode);
	char acc[SMK_NUM_ACCESS_TYPE + 1];

	if (isp->smk_flags & SMK_INODE_IMPURE)
		pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n",
			inode->i_sb->s_id, inode->i_ino, current->comm);

	if (rc <= 0)
		return rc;
	if (rc > SMACK_UNCONFINED_OBJECT)
		rc = 0;
	if (rc == SMACK_UNCONFINED_SUBJECT &&
	    (mode & (MAY_WRITE | MAY_APPEND)))
		isp->smk_flags |= SMK_INODE_IMPURE;

	smk_bu_mode(mode, acc);

	pr_info("Smack %s: (%s %s %s) inode=(%s %ld) %s\n", smk_bu_mess[rc],
		tsp->smk_task->smk_known, isp->smk_inode->smk_known, acc,
		inode->i_sb->s_id, inode->i_ino, current->comm);
	return 0;
}
#else
#define smk_bu_inode(inode, mode, RC) (RC)
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
static int smk_bu_file(struct file *file, int mode, int rc)
{
	struct task_smack *tsp = smack_cred(current_cred());
	struct smack_known *sskp = tsp->smk_task;
	struct inode *inode = file_inode(file);
	struct inode_smack *isp = smack_inode(inode);
	char acc[SMK_NUM_ACCESS_TYPE + 1];

	if (isp->smk_flags & SMK_INODE_IMPURE)
		pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n",
			inode->i_sb->s_id, inode->i_ino, current->comm);

	if (rc <= 0)
		return rc;
	if (rc > SMACK_UNCONFINED_OBJECT)
		rc = 0;

	smk_bu_mode(mode, acc);
	pr_info("Smack %s: (%s %s %s) file=(%s %ld %pD) %s\n", smk_bu_mess[rc],
		sskp->smk_known, smk_of_inode(inode)->smk_known, acc,
		inode->i_sb->s_id, inode->i_ino, file,
		current->comm);
	return 0;
}
#else
#define smk_bu_file(file, mode, RC) (RC)
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
static int smk_bu_credfile(const struct cred *cred, struct file *file,
			   int mode, int rc)
{
	struct task_smack *tsp = smack_cred(cred);
	struct smack_known *sskp = tsp->smk_task;
	struct inode *inode = file_inode(file);
	struct inode_smack *isp = smack_inode(inode);
	char acc[SMK_NUM_ACCESS_TYPE + 1];

	if (isp->smk_flags & SMK_INODE_IMPURE)
		pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n",
			inode->i_sb->s_id, inode->i_ino, current->comm);

	if (rc <= 0)
		return rc;
	if (rc > SMACK_UNCONFINED_OBJECT)
		rc = 0;

	smk_bu_mode(mode, acc);
	pr_info("Smack %s: (%s %s %s) file=(%s %ld %pD) %s\n", smk_bu_mess[rc],
		sskp->smk_known, smk_of_inode(inode)->smk_known, acc,
		inode->i_sb->s_id, inode->i_ino, file,
		current->comm);
	return 0;
}
#else
#define smk_bu_credfile(cred, file, mode, RC) (RC)
#endif

/**
 * smk_fetch - Fetch the smack label from a file.
 * @name: type of the label (attribute)
 * @ip: a pointer to the inode
 * @dp: a pointer to the dentry
 *
 * Returns a pointer to the master list entry for the Smack label,
 * NULL if there was no label to fetch, or an error code.
 */
static struct smack_known *smk_fetch(const char *name, struct inode *ip,
				     struct dentry *dp)
{
	int rc;
	char *buffer;
	struct smack_known *skp = NULL;

	if (!(ip->i_opflags & IOP_XATTR))
		return ERR_PTR(-EOPNOTSUPP);

	buffer = kzalloc(SMK_LONGLABEL, GFP_NOFS);
	if (buffer == NULL)
		return ERR_PTR(-ENOMEM);

	rc = __vfs_getxattr(dp, ip, name, buffer, SMK_LONGLABEL);
	if (rc < 0)
		skp = ERR_PTR(rc);
	else if (rc == 0)
		skp = NULL;
	else
		skp = smk_import_entry(buffer, rc);

	kfree(buffer);

	return skp;
}

/**
 * init_inode_smack - initialize an inode security blob
 * @inode: inode to extract the info from
 * @skp: a pointer to the Smack label entry to use in the blob
 *
 */
static void init_inode_smack(struct inode *inode, struct smack_known *skp)
{
	struct inode_smack *isp = smack_inode(inode);

	isp->smk_inode = skp;
	isp->smk_flags = 0;
}

/**
 * init_task_smack - initialize a task security blob
 * @tsp: blob to initialize
 * @task: a pointer to the Smack label for the running task
 * @forked: a pointer to the Smack label for the forked task
 *
 */
static void init_task_smack(struct task_smack *tsp, struct smack_known *task,
			    struct smack_known *forked)
{
	tsp->smk_task = task;
	tsp->smk_forked = forked;
	INIT_LIST_HEAD(&tsp->smk_rules);
	INIT_LIST_HEAD(&tsp->smk_relabel);
	mutex_init(&tsp->smk_rules_lock);
}

/**
 * smk_copy_rules - copy a rule set
 * @nhead: new rules header pointer
 * @ohead: old rules header pointer
 * @gfp: type of the memory for the allocation
 *
 * Returns 0 on success, -ENOMEM on error
 */
static int smk_copy_rules(struct list_head *nhead, struct list_head *ohead,
			  gfp_t gfp)
{
	struct smack_rule *nrp;
	struct smack_rule *orp;
	int rc = 0;

	list_for_each_entry_rcu(orp, ohead, list) {
		nrp = kmem_cache_zalloc(smack_rule_cache, gfp);
		if (nrp == NULL) {
			rc = -ENOMEM;
			break;
		}
		*nrp = *orp;
		list_add_rcu(&nrp->list, nhead);
	}
	return rc;
}

/**
 * smk_copy_relabel - copy smk_relabel labels list
 * @nhead: new rules header pointer
 * @ohead: old rules header pointer
 * @gfp: type of the memory for the allocation
 *
 * Returns 0 on success, -ENOMEM on error
 */
static int smk_copy_relabel(struct list_head *nhead, struct list_head *ohead,
			    gfp_t gfp)
{
	struct smack_known_list_elem *nklep;
	struct smack_known_list_elem *oklep;

	list_for_each_entry(oklep, ohead, list) {
		nklep = kzalloc(sizeof(struct smack_known_list_elem), gfp);
		if (nklep == NULL) {
			smk_destroy_label_list(nhead);
			return -ENOMEM;
		}
		nklep->smk_label = oklep->smk_label;
		list_add(&nklep->list, nhead);
	}

	return 0;
}

/**
 * smk_ptrace_mode - helper function for converting PTRACE_MODE_* into MAY_*
 * @mode: input mode in form of PTRACE_MODE_*
 *
 * Returns a converted MAY_* mode usable by smack rules
 */
static inline unsigned int smk_ptrace_mode(unsigned int mode)
{
	if (mode & PTRACE_MODE_ATTACH)
		return MAY_READWRITE;
	if (mode & PTRACE_MODE_READ)
		return MAY_READ;

	return 0;
}

/**
 * smk_ptrace_rule_check - helper for ptrace access
 * @tracer: tracer process
 * @tracee_known: label entry of the process that's about to be traced
 * @mode: ptrace attachment mode (PTRACE_MODE_*)
 * @func: name of the function that called us, used for audit
 *
 * Returns 0 on access granted, -error on error
 */
static int smk_ptrace_rule_check(struct task_struct *tracer,
				 struct smack_known *tracee_known,
				 unsigned int mode, const char *func)
{
	int rc;
	struct smk_audit_info ad, *saip = NULL;
	struct task_smack *tsp;
	struct smack_known *tracer_known;
	const struct cred *tracercred;

	if ((mode & PTRACE_MODE_NOAUDIT) == 0) {
		smk_ad_init(&ad, func, LSM_AUDIT_DATA_TASK);
		smk_ad_setfield_u_tsk(&ad, tracer);
		saip = &ad;
	}

	rcu_read_lock();
	tracercred = __task_cred(tracer);
	tsp = smack_cred(tracercred);
	tracer_known = smk_of_task(tsp);

	if ((mode & PTRACE_MODE_ATTACH) &&
	    (smack_ptrace_rule == SMACK_PTRACE_EXACT ||
	     smack_ptrace_rule == SMACK_PTRACE_DRACONIAN)) {
		if (tracer_known->smk_known == tracee_known->smk_known)
			rc = 0;
		else if (smack_ptrace_rule == SMACK_PTRACE_DRACONIAN)
			rc = -EACCES;
		else if (smack_privileged_cred(CAP_SYS_PTRACE, tracercred))
			rc = 0;
		else
			rc = -EACCES;

		if (saip)
			smack_log(tracer_known->smk_known,
				  tracee_known->smk_known,
				  0, rc, saip);

		rcu_read_unlock();
		return rc;
	}

	/* In case of rule==SMACK_PTRACE_DEFAULT or mode==PTRACE_MODE_READ */
	rc = smk_tskacc(tsp, tracee_known, smk_ptrace_mode(mode), saip);

	rcu_read_unlock();
	return rc;
}

/*
 * LSM hooks.
 * We he, that is fun!
 */

/**
 * smack_ptrace_access_check - Smack approval on PTRACE_ATTACH
 * @ctp: child task pointer
 * @mode: ptrace attachment mode (PTRACE_MODE_*)
 *
 * Returns 0 if access is OK, an error code otherwise
 *
 * Do the capability checks.
 */
static int smack_ptrace_access_check(struct task_struct *ctp, unsigned int mode)
{
	struct smack_known *skp;

	skp = smk_of_task_struct_obj(ctp);

	return smk_ptrace_rule_check(current, skp, mode, __func__);
}

/**
 * smack_ptrace_traceme - Smack approval on PTRACE_TRACEME
 * @ptp: parent task pointer
 *
 * Returns 0 if access is OK, an error code otherwise
 *
 * Do the capability checks, and require PTRACE_MODE_ATTACH.
 */
static int smack_ptrace_traceme(struct task_struct *ptp)
{
	struct smack_known *skp;

	skp = smk_of_task(smack_cred(current_cred()));

	return smk_ptrace_rule_check(ptp, skp, PTRACE_MODE_ATTACH, __func__);
}

/**
 * smack_syslog - Smack approval on syslog
 * @typefrom_file: unused
 *
 * Returns 0 on success, error code otherwise.
 */
static int smack_syslog(int typefrom_file)
{
	int rc = 0;
	struct smack_known *skp = smk_of_current();

	if (smack_privileged(CAP_MAC_OVERRIDE))
		return 0;

	if (smack_syslog_label != NULL && smack_syslog_label != skp)
		rc = -EACCES;

	return rc;
}

/*
 * Superblock Hooks.
 */

/**
 * smack_sb_alloc_security - allocate a superblock blob
 * @sb: the superblock getting the blob
 *
 * Returns 0 on success or -ENOMEM on error.
 */
static int smack_sb_alloc_security(struct super_block *sb)
{
	struct superblock_smack *sbsp = smack_superblock(sb);

	sbsp->smk_root = &smack_known_floor;
	sbsp->smk_default = &smack_known_floor;
	sbsp->smk_floor = &smack_known_floor;
	sbsp->smk_hat = &smack_known_hat;
	/*
	 * SMK_SB_INITIALIZED will be zero from kzalloc.
	 */

	return 0;
}
struct smack_mnt_opts {
	const char *fsdefault;
	const char *fsfloor;
	const char *fshat;
	const char *fsroot;
	const char *fstransmute;
};

static void smack_free_mnt_opts(void *mnt_opts)
{
	kfree(mnt_opts);
}

static int smack_add_opt(int token, const char *s, void **mnt_opts)
{
	struct smack_mnt_opts *opts = *mnt_opts;
	struct smack_known *skp;

	if (!opts) {
		opts = kzalloc(sizeof(struct smack_mnt_opts), GFP_KERNEL);
		if (!opts)
			return -ENOMEM;
		*mnt_opts = opts;
	}
	if (!s)
		return -ENOMEM;

	skp = smk_import_entry(s, 0);
	if (IS_ERR(skp))
		return PTR_ERR(skp);

	switch (token) {
	case Opt_fsdefault:
		if (opts->fsdefault)
			goto out_opt_err;
		opts->fsdefault = skp->smk_known;
		break;
	case Opt_fsfloor:
		if (opts->fsfloor)
			goto out_opt_err;
		opts->fsfloor = skp->smk_known;
		break;
	case Opt_fshat:
		if (opts->fshat)
			goto out_opt_err;
		opts->fshat = skp->smk_known;
		break;
	case Opt_fsroot:
		if (opts->fsroot)
			goto out_opt_err;
		opts->fsroot = skp->smk_known;
		break;
	case Opt_fstransmute:
		if (opts->fstransmute)
			goto out_opt_err;
		opts->fstransmute = skp->smk_known;
		break;
	}
	return 0;

out_opt_err:
	pr_warn("Smack: duplicate mount options\n");
	return -EINVAL;
}

/**
 * smack_fs_context_submount - Initialise security data for a filesystem context
 * @fc: The filesystem context.
 * @reference: reference superblock
 *
 * Returns 0 on success or -ENOMEM on error.
 */
static int smack_fs_context_submount(struct fs_context *fc,
				     struct super_block *reference)
{
	struct superblock_smack *sbsp;
	struct smack_mnt_opts *ctx;
	struct inode_smack *isp;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	fc->security = ctx;

	sbsp = smack_superblock(reference);
	isp = smack_inode(reference->s_root->d_inode);

	if (sbsp->smk_default) {
		ctx->fsdefault = kstrdup(sbsp->smk_default->smk_known,
					 GFP_KERNEL);
		if (!ctx->fsdefault)
			return -ENOMEM;
	}

	if (sbsp->smk_floor) {
		ctx->fsfloor = kstrdup(sbsp->smk_floor->smk_known, GFP_KERNEL);
		if (!ctx->fsfloor)
			return -ENOMEM;
	}

	if (sbsp->smk_hat) {
		ctx->fshat = kstrdup(sbsp->smk_hat->smk_known, GFP_KERNEL);
		if (!ctx->fshat)
			return -ENOMEM;
	}

	if (isp->smk_flags & SMK_INODE_TRANSMUTE) {
		if (sbsp->smk_root) {
			ctx->fstransmute = kstrdup(sbsp->smk_root->smk_known,
						   GFP_KERNEL);
			if (!ctx->fstransmute)
				return -ENOMEM;
		}
	}
	return 0;
}

/**
 * smack_fs_context_dup - Duplicate the security data on fs_context duplication
 * @fc: The new filesystem context.
 * @src_fc: The source filesystem context being duplicated.
 *
 * Returns 0 on success or -ENOMEM on error.
 */
static int smack_fs_context_dup(struct fs_context *fc,
				struct fs_context *src_fc)
{
	struct smack_mnt_opts *dst, *src = src_fc->security;

	if (!src)
		return 0;

	fc->security = kzalloc(sizeof(struct smack_mnt_opts), GFP_KERNEL);
	if (!fc->security)
		return -ENOMEM;

	dst = fc->security;
	dst->fsdefault = src->fsdefault;
	dst->fsfloor = src->fsfloor;
	dst->fshat = src->fshat;
	dst->fsroot = src->fsroot;
	dst->fstransmute = src->fstransmute;

	return 0;
}

static const struct fs_parameter_spec smack_fs_parameters[] = {
	fsparam_string("smackfsdef",		Opt_fsdefault),
	fsparam_string("smackfsdefault",	Opt_fsdefault),
	fsparam_string("smackfsfloor",		Opt_fsfloor),
	fsparam_string("smackfshat",		Opt_fshat),
	fsparam_string("smackfsroot",		Opt_fsroot),
	fsparam_string("smackfstransmute",	Opt_fstransmute),
	{}
};
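/*
 * Usage note (editorial): the table above is what lets a privileged
 * (CAP_MAC_ADMIN) mounter label a filesystem at mount time, e.g.:
 *
 *	mount -t tmpfs -o smackfsroot=MyLabel,smackfsdefault=MyLabel \
 *		tmpfs /mnt
 *
 * Every value must import cleanly as a Smack label, and each option may
 * be given at most once (smack_add_opt() rejects duplicates).
 */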
/**
 * smack_fs_context_parse_param - Parse a single mount parameter
 * @fc: The new filesystem context being constructed.
 * @param: The parameter.
 *
 * Returns 0 on success, -ENOPARAM to pass the parameter on or anything else on
 * error.
 */
static int smack_fs_context_parse_param(struct fs_context *fc,
					struct fs_parameter *param)
{
	struct fs_parse_result result;
	int opt, rc;

	opt = fs_parse(fc, smack_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	rc = smack_add_opt(opt, param->string, &fc->security);
	if (!rc)
		param->string = NULL;

	return rc;
}

static int smack_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
	char *from = options, *to = options;
	bool first = true;

	while (1) {
		char *next = strchr(from, ',');
		int token, len, rc;
		char *arg = NULL;

		if (next)
			len = next - from;
		else
			len = strlen(from);

		token = match_opt_prefix(from, len, &arg);
		if (token != Opt_error) {
			arg = kmemdup_nul(arg, from + len - arg, GFP_KERNEL);
			rc = smack_add_opt(token, arg, mnt_opts);
			kfree(arg);
			if (unlikely(rc)) {
				if (*mnt_opts)
					smack_free_mnt_opts(*mnt_opts);
				*mnt_opts = NULL;
				return rc;
			}
		} else {
			if (!first) {	// copy with preceding comma
				from--;
				len++;
			}
			if (to != from)
				memmove(to, from, len);
			to += len;
			first = false;
		}
		if (!from[len])
			break;
		from += len + 1;
	}
	*to = '\0';
	return 0;
}
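/*
 * Worked example (editorial): given options "smackfsdef=Pop,rw,noatime",
 * smack_sb_eat_lsm_opts() consumes the Smack option into *mnt_opts and
 * compacts the remainder in place, leaving "rw,noatime" for the
 * filesystem to parse.  The "first" flag only controls whether the
 * preceding comma is copied along with a non-Smack option.
 */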
/**
 * smack_set_mnt_opts - set Smack specific mount options
 * @sb: the file system superblock
 * @mnt_opts: Smack mount options
 * @kern_flags: mount option from kernel space or user space
 * @set_kern_flags: where to store converted mount opts
 *
 * Returns 0 on success, an error code on failure
 *
 * Allow filesystems with binary mount data to explicitly set Smack mount
 * labels.
 */
static int smack_set_mnt_opts(struct super_block *sb,
			      void *mnt_opts,
			      unsigned long kern_flags,
			      unsigned long *set_kern_flags)
{
	struct dentry *root = sb->s_root;
	struct inode *inode = d_backing_inode(root);
	struct superblock_smack *sp = smack_superblock(sb);
	struct inode_smack *isp;
	struct smack_known *skp;
	struct smack_mnt_opts *opts = mnt_opts;
	bool transmute = false;

	if (sp->smk_flags & SMK_SB_INITIALIZED)
		return 0;

	if (!smack_privileged(CAP_MAC_ADMIN)) {
		/*
		 * Unprivileged mounts don't get to specify Smack values.
		 */
		if (opts)
			return -EPERM;
		/*
		 * Unprivileged mounts get root and default from the caller.
		 */
		skp = smk_of_current();
		sp->smk_root = skp;
		sp->smk_default = skp;
		/*
		 * For a handful of fs types with no user-controlled
		 * backing store it's okay to trust security labels
		 * in the filesystem. The rest are untrusted.
		 */
		if (sb->s_user_ns != &init_user_ns &&
		    sb->s_magic != SYSFS_MAGIC && sb->s_magic != TMPFS_MAGIC &&
		    sb->s_magic != RAMFS_MAGIC) {
			transmute = true;
			sp->smk_flags |= SMK_SB_UNTRUSTED;
		}
	}

	sp->smk_flags |= SMK_SB_INITIALIZED;

	if (opts) {
		if (opts->fsdefault) {
			skp = smk_import_entry(opts->fsdefault, 0);
			if (IS_ERR(skp))
				return PTR_ERR(skp);
			sp->smk_default = skp;
		}
		if (opts->fsfloor) {
			skp = smk_import_entry(opts->fsfloor, 0);
			if (IS_ERR(skp))
				return PTR_ERR(skp);
			sp->smk_floor = skp;
		}
		if (opts->fshat) {
			skp = smk_import_entry(opts->fshat, 0);
			if (IS_ERR(skp))
				return PTR_ERR(skp);
			sp->smk_hat = skp;
		}
		if (opts->fsroot) {
			skp = smk_import_entry(opts->fsroot, 0);
			if (IS_ERR(skp))
				return PTR_ERR(skp);
			sp->smk_root = skp;
		}
		if (opts->fstransmute) {
			skp = smk_import_entry(opts->fstransmute, 0);
			if (IS_ERR(skp))
				return PTR_ERR(skp);
			sp->smk_root = skp;
			transmute = true;
		}
	}

	/*
	 * Initialize the root inode.
	 */
	init_inode_smack(inode, sp->smk_root);

	if (transmute) {
		isp = smack_inode(inode);
		isp->smk_flags |= SMK_INODE_TRANSMUTE;
	}

	return 0;
}

/**
 * smack_sb_statfs - Smack check on statfs
 * @dentry: identifies the file system in question
 *
 * Returns 0 if current can read the floor of the filesystem,
 * and error code otherwise
 */
static int smack_sb_statfs(struct dentry *dentry)
{
	struct superblock_smack *sbp = smack_superblock(dentry->d_sb);
	int rc;
	struct smk_audit_info ad;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

	rc = smk_curacc(sbp->smk_floor, MAY_READ, &ad);
	rc = smk_bu_current("statfs", sbp->smk_floor, MAY_READ, rc);
	return rc;
}

/*
 * BPRM hooks
 */

/**
 * smack_bprm_creds_for_exec - Update bprm->cred if needed for exec
 * @bprm: the exec information
 *
 * Returns 0 if it gets a blob, -EPERM if exec forbidden and -ENOMEM otherwise
 */
static int smack_bprm_creds_for_exec(struct linux_binprm *bprm)
{
	struct inode *inode = file_inode(bprm->file);
	struct task_smack *bsp = smack_cred(bprm->cred);
	struct inode_smack *isp;
	struct superblock_smack *sbsp;
	int rc;

	isp = smack_inode(inode);
	if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task)
		return 0;

	sbsp = smack_superblock(inode->i_sb);
	if ((sbsp->smk_flags & SMK_SB_UNTRUSTED) &&
	    isp->smk_task != sbsp->smk_root)
		return 0;

	if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
		struct task_struct *tracer;
		rc = 0;

		rcu_read_lock();
		tracer = ptrace_parent(current);
		if (likely(tracer != NULL))
			rc = smk_ptrace_rule_check(tracer,
						   isp->smk_task,
						   PTRACE_MODE_ATTACH,
						   __func__);
		rcu_read_unlock();

		if (rc != 0)
			return rc;
	}
	if (bprm->unsafe & ~LSM_UNSAFE_PTRACE)
		return -EPERM;

	bsp->smk_task = isp->smk_task;
	bprm->per_clear |= PER_CLEAR_ON_SETID;

	/* Decide if this is a secure exec. */
	if (bsp->smk_task != bsp->smk_forked)
		bprm->secureexec = 1;

	return 0;
}

/*
 * Inode hooks
 */

/**
 * smack_inode_alloc_security - allocate an inode blob
 * @inode: the inode in need of a blob
 *
 * Returns 0
 */
static int smack_inode_alloc_security(struct inode *inode)
{
	struct smack_known *skp = smk_of_current();

	init_inode_smack(inode, skp);
	return 0;
}

/**
 * smack_inode_init_security - copy out the smack from an inode
 * @inode: the newly created inode
 * @dir: containing directory object
 * @qstr: unused
 * @xattrs: where to put the attributes
 * @xattr_count: current number of LSM-provided xattrs (updated)
 *
 * Returns 0 if it all works out, -ENOMEM if there's no memory
 */
static int smack_inode_init_security(struct inode *inode, struct inode *dir,
				     const struct qstr *qstr,
				     struct xattr *xattrs, int *xattr_count)
{
	struct task_smack *tsp = smack_cred(current_cred());
	struct inode_smack *issp = smack_inode(inode);
	struct smack_known *skp = smk_of_task(tsp);
	struct smack_known *isp = smk_of_inode(inode);
	struct smack_known *dsp = smk_of_inode(dir);
	struct xattr *xattr = lsm_get_xattr_slot(xattrs, xattr_count);
	int may;

	/*
	 * If equal, transmuting already occurred in
	 * smack_dentry_create_files_as(). No need to check again.
	 */
	if (tsp->smk_task != tsp->smk_transmuted) {
		rcu_read_lock();
		may = smk_access_entry(skp->smk_known, dsp->smk_known,
				       &skp->smk_rules);
		rcu_read_unlock();
	}

	/*
	 * In addition to having smk_task equal to smk_transmuted,
	 * if the access rule allows transmutation and the directory
	 * requests transmutation then by all means transmute.
	 * Mark the inode as changed.
	 */
	if ((tsp->smk_task == tsp->smk_transmuted) ||
	    (may > 0 && ((may & MAY_TRANSMUTE) != 0) &&
	     smk_inode_transmutable(dir))) {
		struct xattr *xattr_transmute;

		/*
		 * The caller of smack_dentry_create_files_as()
		 * should have overridden the current cred, so the
		 * inode label was already set correctly in
		 * smack_inode_alloc_security().
		 */
		if (tsp->smk_task != tsp->smk_transmuted)
			isp = issp->smk_inode = dsp;

		issp->smk_flags |= SMK_INODE_TRANSMUTE;
		xattr_transmute = lsm_get_xattr_slot(xattrs, xattr_count);
		if (xattr_transmute) {
			xattr_transmute->value = kmemdup(TRANS_TRUE,
							 TRANS_TRUE_SIZE,
							 GFP_NOFS);
			if (!xattr_transmute->value)
				return -ENOMEM;

			xattr_transmute->value_len = TRANS_TRUE_SIZE;
			xattr_transmute->name = XATTR_SMACK_TRANSMUTE;
		}
	}

	issp->smk_flags |= SMK_INODE_INSTANT;

	if (xattr) {
		xattr->value = kstrdup(isp->smk_known, GFP_NOFS);
		if (!xattr->value)
			return -ENOMEM;

		xattr->value_len = strlen(isp->smk_known);
		xattr->name = XATTR_SMACK_SUFFIX;
	}

	return 0;
}
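/*
 * Usage note (editorial): "transmuting" means that an object created in
 * a directory carrying SMACK64TRANSMUTE=TRUE receives the directory's
 * label rather than the creating task's label, provided the task's rule
 * for the directory includes MAY_TRANSMUTE (the 't' mode in Smack rule
 * syntax).  The function above emits the SMACK64 xattr for the new
 * inode and, when transmuting applies, a SMACK64TRANSMUTE xattr as well.
 */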
/**
 * smack_inode_link - Smack check on link
 * @old_dentry: the existing object
 * @dir: unused
 * @new_dentry: the new object
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
			    struct dentry *new_dentry)
{
	struct smack_known *isp;
	struct smk_audit_info ad;
	int rc;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
	smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);

	isp = smk_of_inode(d_backing_inode(old_dentry));
	rc = smk_curacc(isp, MAY_WRITE, &ad);
	rc = smk_bu_inode(d_backing_inode(old_dentry), MAY_WRITE, rc);

	if (rc == 0 && d_is_positive(new_dentry)) {
		isp = smk_of_inode(d_backing_inode(new_dentry));
		smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
		rc = smk_curacc(isp, MAY_WRITE, &ad);
		rc = smk_bu_inode(d_backing_inode(new_dentry), MAY_WRITE, rc);
	}

	return rc;
}

/**
 * smack_inode_unlink - Smack check on inode deletion
 * @dir: containing directory object
 * @dentry: file to unlink
 *
 * Returns 0 if current can write the containing directory
 * and the object, error code otherwise
 */
static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *ip = d_backing_inode(dentry);
	struct smk_audit_info ad;
	int rc;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

	/*
	 * You need write access to the thing you're unlinking
	 */
	rc = smk_curacc(smk_of_inode(ip), MAY_WRITE, &ad);
	rc = smk_bu_inode(ip, MAY_WRITE, rc);
	if (rc == 0) {
		/*
		 * You also need write access to the containing directory
		 */
		smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_INODE);
		smk_ad_setfield_u_fs_inode(&ad, dir);
		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad);
		rc = smk_bu_inode(dir, MAY_WRITE, rc);
	}
	return rc;
}

/**
 * smack_inode_rmdir - Smack check on directory deletion
 * @dir: containing directory object
 * @dentry: directory to unlink
 *
 * Returns 0 if current can write the containing directory
 * and the directory, error code otherwise
 */
static int smack_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct smk_audit_info ad;
	int rc;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

	/*
	 * You need write access to the thing you're removing
	 */
	rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_WRITE, &ad);
	rc = smk_bu_inode(d_backing_inode(dentry), MAY_WRITE, rc);
	if (rc == 0) {
		/*
		 * You also need write access to the containing directory
		 */
		smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_INODE);
		smk_ad_setfield_u_fs_inode(&ad, dir);
		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad);
		rc = smk_bu_inode(dir, MAY_WRITE, rc);
	}

	return rc;
}

/**
 * smack_inode_rename - Smack check on rename
 * @old_inode: unused
 * @old_dentry: the old object
 * @new_inode: unused
 * @new_dentry: the new object
 *
 * Read and write access is required on both the old and
 * new directories.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_rename(struct inode *old_inode,
			      struct dentry *old_dentry,
			      struct inode *new_inode,
			      struct dentry *new_dentry)
{
	int rc;
	struct smack_known *isp;
	struct smk_audit_info ad;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
	smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);

	isp = smk_of_inode(d_backing_inode(old_dentry));
	rc = smk_curacc(isp, MAY_READWRITE, &ad);
	rc = smk_bu_inode(d_backing_inode(old_dentry), MAY_READWRITE, rc);

	if (rc == 0 && d_is_positive(new_dentry)) {
		isp = smk_of_inode(d_backing_inode(new_dentry));
		smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
		rc = smk_curacc(isp, MAY_READWRITE, &ad);
		rc = smk_bu_inode(d_backing_inode(new_dentry),
				  MAY_READWRITE, rc);
	}
	return rc;
}

/**
 * smack_inode_permission - Smack version of permission()
 * @inode: the inode in question
 * @mask: the access requested
 *
 * This is the important Smack hook.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_permission(struct inode *inode, int mask)
{
	struct superblock_smack *sbsp = smack_superblock(inode->i_sb);
	struct smk_audit_info ad;
	int no_block = mask & MAY_NOT_BLOCK;
	int rc;

	mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND);
	/*
	 * No permission to check. Existence test. Yup, it's there.
	 */
	if (mask == 0)
		return 0;

	if (sbsp->smk_flags & SMK_SB_UNTRUSTED) {
		if (smk_of_inode(inode) != sbsp->smk_root)
			return -EACCES;
	}

	/* May be droppable after audit */
	if (no_block)
		return -ECHILD;
	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_INODE);
	smk_ad_setfield_u_fs_inode(&ad, inode);
	rc = smk_curacc(smk_of_inode(inode), mask, &ad);
	rc = smk_bu_inode(inode, mask, rc);
	return rc;
}

/**
 * smack_inode_setattr - Smack check for setting attributes
 * @idmap: idmap of the mount
 * @dentry: the object
 * @iattr: for the force flag
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			       struct iattr *iattr)
{
	struct smk_audit_info ad;
	int rc;

	/*
	 * Need to allow for clearing the setuid bit.
	 */
	if (iattr->ia_valid & ATTR_FORCE)
		return 0;
	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

	rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_WRITE, &ad);
	rc = smk_bu_inode(d_backing_inode(dentry), MAY_WRITE, rc);
	return rc;
}

/**
 * smack_inode_getattr - Smack check for getting attributes
 * @path: path to extract the info from
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_getattr(const struct path *path)
{
	struct smk_audit_info ad;
	struct inode *inode = d_backing_inode(path->dentry);
	int rc;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
	smk_ad_setfield_u_fs_path(&ad, *path);
	rc = smk_curacc(smk_of_inode(inode), MAY_READ, &ad);
	rc = smk_bu_inode(inode, MAY_READ, rc);
	return rc;
}

/**
 * smack_inode_xattr_skipcap - Skip the xattr capability checks?
 * @name: name of the xattr
 *
 * Returns 1 to indicate that Smack "owns" the access control rights to xattrs
 * named @name; the LSM layer should avoid enforcing any traditional
 * capability based access controls on this xattr.  Returns 0 to indicate that
 * Smack does not "own" the access control rights to xattrs named @name and is
 * deferring to the LSM layer for further access controls, including capability
 * based controls.
 */
static int smack_inode_xattr_skipcap(const char *name)
{
	if (strncmp(name, XATTR_SMACK_SUFFIX, strlen(XATTR_SMACK_SUFFIX)))
		return 0;

	if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
	    strcmp(name, XATTR_NAME_SMACKIPIN) == 0 ||
	    strcmp(name, XATTR_NAME_SMACKIPOUT) == 0 ||
	    strcmp(name, XATTR_NAME_SMACKEXEC) == 0 ||
	    strcmp(name, XATTR_NAME_SMACKMMAP) == 0 ||
	    strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0)
		return 1;

	return 0;
}

/**
 * smack_inode_setxattr - Smack check for setting xattrs
 * @idmap: idmap of the mount
 * @dentry: the object
 * @name: name of the attribute
 * @value: value of the attribute
 * @size: size of the value
 * @flags: unused
 *
 * This protects the Smack attribute explicitly.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
				const char *name, const void *value,
				size_t size, int flags)
{
	struct smk_audit_info ad;
	struct smack_known *skp;
	int check_priv = 0;
	int check_import = 0;
	int check_star = 0;
	int rc = 0;

	/*
	 * Check label validity here so import won't fail in post_setxattr
	 */
	if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
	    strcmp(name, XATTR_NAME_SMACKIPIN) == 0 ||
	    strcmp(name, XATTR_NAME_SMACKIPOUT) == 0) {
		check_priv = 1;
		check_import = 1;
	} else if (strcmp(name, XATTR_NAME_SMACKEXEC) == 0 ||
		   strcmp(name, XATTR_NAME_SMACKMMAP) == 0) {
		check_priv = 1;
		check_import = 1;
		check_star = 1;
	} else if (strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0) {
		check_priv = 1;
		if (!S_ISDIR(d_backing_inode(dentry)->i_mode) ||
		    size != TRANS_TRUE_SIZE ||
		    strncmp(value, TRANS_TRUE, TRANS_TRUE_SIZE) != 0)
			rc = -EINVAL;
	}

	if (check_priv && !smack_privileged(CAP_MAC_ADMIN))
		rc = -EPERM;

	if (rc == 0 && check_import) {
		skp = size ? smk_import_entry(value, size) : NULL;
		if (IS_ERR(skp))
			rc = PTR_ERR(skp);
		else if (skp == NULL || (check_star &&
		    (skp == &smack_known_star || skp == &smack_known_web)))
			rc = -EINVAL;
	}

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

	if (rc == 0) {
		rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)),
				MAY_WRITE, &ad);
		rc = smk_bu_inode(d_backing_inode(dentry), MAY_WRITE, rc);
	}

	return rc;
}
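/*
 * Usage note (editorial): these checks are reached through the ordinary
 * xattr system calls; with CAP_MAC_ADMIN one can, for example:
 *
 *	setfattr -n security.SMACK64 -v MyLabel /path/to/file
 *	setfattr -n security.SMACK64TRANSMUTE -v TRUE /path/to/dir
 *
 * SMACK64TRANSMUTE is accepted only on directories and only with the
 * exact value "TRUE", as enforced above.
 */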
*/ static void smack_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct smack_known *skp; struct inode_smack *isp = smack_inode(d_backing_inode(dentry)); if (strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0) { isp->smk_flags |= SMK_INODE_TRANSMUTE; return; } if (strcmp(name, XATTR_NAME_SMACK) == 0) { skp = smk_import_entry(value, size); if (!IS_ERR(skp)) isp->smk_inode = skp; } else if (strcmp(name, XATTR_NAME_SMACKEXEC) == 0) { skp = smk_import_entry(value, size); if (!IS_ERR(skp)) isp->smk_task = skp; } else if (strcmp(name, XATTR_NAME_SMACKMMAP) == 0) { skp = smk_import_entry(value, size); if (!IS_ERR(skp)) isp->smk_mmap = skp; } return; } /** * smack_inode_getxattr - Smack check on getxattr * @dentry: the object * @name: unused * * Returns 0 if access is permitted, an error code otherwise */ static int smack_inode_getxattr(struct dentry *dentry, const char *name) { struct smk_audit_info ad; int rc; smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY); smk_ad_setfield_u_fs_path_dentry(&ad, dentry); rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_READ, &ad); rc = smk_bu_inode(d_backing_inode(dentry), MAY_READ, rc); return rc; } /** * smack_inode_removexattr - Smack check on removexattr * @idmap: idmap of the mount * @dentry: the object * @name: name of the attribute * * Removing the Smack attribute requires CAP_MAC_ADMIN * * Returns 0 if access is permitted, an error code otherwise */ static int smack_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode_smack *isp; struct smk_audit_info ad; int rc = 0; if (strcmp(name, XATTR_NAME_SMACK) == 0 || strcmp(name, XATTR_NAME_SMACKIPIN) == 0 || strcmp(name, XATTR_NAME_SMACKIPOUT) == 0 || strcmp(name, XATTR_NAME_SMACKEXEC) == 0 || strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0 || strcmp(name, XATTR_NAME_SMACKMMAP) == 0) { if (!smack_privileged(CAP_MAC_ADMIN)) rc = -EPERM; } if (rc != 0) return rc; smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY); smk_ad_setfield_u_fs_path_dentry(&ad, dentry); rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_WRITE, &ad); rc = smk_bu_inode(d_backing_inode(dentry), MAY_WRITE, rc); if (rc != 0) return rc; isp = smack_inode(d_backing_inode(dentry)); /* * Don't do anything special for these. 
	if (strcmp(name, XATTR_NAME_SMACK) == 0) {
		struct super_block *sbp = dentry->d_sb;
		struct superblock_smack *sbsp = smack_superblock(sbp);

		isp->smk_inode = sbsp->smk_default;
	} else if (strcmp(name, XATTR_NAME_SMACKEXEC) == 0)
		isp->smk_task = NULL;
	else if (strcmp(name, XATTR_NAME_SMACKMMAP) == 0)
		isp->smk_mmap = NULL;
	else if (strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0)
		isp->smk_flags &= ~SMK_INODE_TRANSMUTE;

	return 0;
}

/**
 * smack_inode_set_acl - Smack check for setting posix acls
 * @idmap: idmap of the mnt this request came from
 * @dentry: the object
 * @acl_name: name of the posix acl
 * @kacl: the posix acls
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_set_acl(struct mnt_idmap *idmap,
			       struct dentry *dentry, const char *acl_name,
			       struct posix_acl *kacl)
{
	struct smk_audit_info ad;
	int rc;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

	rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_WRITE, &ad);
	rc = smk_bu_inode(d_backing_inode(dentry), MAY_WRITE, rc);
	return rc;
}

/**
 * smack_inode_get_acl - Smack check for getting posix acls
 * @idmap: idmap of the mnt this request came from
 * @dentry: the object
 * @acl_name: name of the posix acl
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_get_acl(struct mnt_idmap *idmap,
			       struct dentry *dentry, const char *acl_name)
{
	struct smk_audit_info ad;
	int rc;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

	rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_READ, &ad);
	rc = smk_bu_inode(d_backing_inode(dentry), MAY_READ, rc);
	return rc;
}

/**
 * smack_inode_remove_acl - Smack check for removing posix acls
 * @idmap: idmap of the mnt this request came from
 * @dentry: the object
 * @acl_name: name of the posix acl
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_remove_acl(struct mnt_idmap *idmap,
				  struct dentry *dentry, const char *acl_name)
{
	struct smk_audit_info ad;
	int rc;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

	rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_WRITE, &ad);
	rc = smk_bu_inode(d_backing_inode(dentry), MAY_WRITE, rc);
	return rc;
}

/**
 * smack_inode_getsecurity - get smack xattrs
 * @idmap: idmap of the mount
 * @inode: the object
 * @name: attribute name
 * @buffer: where to put the result
 * @alloc: duplicate memory
 *
 * Returns the size of the attribute or an error code
 */
static int smack_inode_getsecurity(struct mnt_idmap *idmap,
				   struct inode *inode, const char *name,
				   void **buffer, bool alloc)
{
	struct socket_smack *ssp;
	struct socket *sock;
	struct super_block *sbp;
	struct inode *ip = inode;
	struct smack_known *isp;
	struct inode_smack *ispp;
	size_t label_len;
	char *label = NULL;

	if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) {
		isp = smk_of_inode(inode);
	} else if (strcmp(name, XATTR_SMACK_TRANSMUTE) == 0) {
		ispp = smack_inode(inode);
		if (ispp->smk_flags & SMK_INODE_TRANSMUTE)
			label = TRANS_TRUE;
		else
			label = "";
	} else {
		/*
		 * The rest of the Smack xattrs are only on sockets.
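		 * (SMACK64IPIN and SMACK64IPOUT live in the socket
		 * security blob rather than in the inode, so fetch
		 * them from there.)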
		 */
		sbp = ip->i_sb;
		if (sbp->s_magic != SOCKFS_MAGIC)
			return -EOPNOTSUPP;

		sock = SOCKET_I(ip);
		if (sock == NULL || sock->sk == NULL)
			return -EOPNOTSUPP;

		ssp = smack_sock(sock->sk);

		if (strcmp(name, XATTR_SMACK_IPIN) == 0)
			isp = ssp->smk_in;
		else if (strcmp(name, XATTR_SMACK_IPOUT) == 0)
			isp = ssp->smk_out;
		else
			return -EOPNOTSUPP;
	}

	if (!label)
		label = isp->smk_known;

	label_len = strlen(label);

	if (alloc) {
		*buffer = kstrdup(label, GFP_KERNEL);
		if (*buffer == NULL)
			return -ENOMEM;
	}

	return label_len;
}

/**
 * smack_inode_listsecurity - list the Smack attributes
 * @inode: the object
 * @buffer: where they go
 * @buffer_size: size of buffer
 */
static int smack_inode_listsecurity(struct inode *inode, char *buffer,
				    size_t buffer_size)
{
	int len = sizeof(XATTR_NAME_SMACK);

	if (buffer != NULL && len <= buffer_size)
		memcpy(buffer, XATTR_NAME_SMACK, len);

	return len;
}

/**
 * smack_inode_getlsmprop - Extract inode's security id
 * @inode: inode to extract the info from
 * @prop: where result will be saved
 */
static void smack_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
{
	prop->smack.skp = smk_of_inode(inode);
}

/*
 * File Hooks
 */

/*
 * There is no smack_file_permission hook
 *
 * Should access checks be done on each read or write?
 * UNICOS and SELinux say yes.
 * Trusted Solaris, Trusted Irix, and just about everyone else says no.
 *
 * I'll say no for now. Smack does not do the frequent
 * label changing that SELinux does.
 */

/**
 * smack_file_alloc_security - assign a file security blob
 * @file: the object
 *
 * The security blob for a file is a pointer to the master
 * label list, so no allocation is done.
 *
 * f_security is the owner security information. It
 * isn't used on file access checks, it's for send_sigio.
 *
 * Returns 0
 */
static int smack_file_alloc_security(struct file *file)
{
	struct smack_known **blob = smack_file(file);

	*blob = smk_of_current();
	return 0;
}

/**
 * smack_file_ioctl - Smack check on ioctls
 * @file: the object
 * @cmd: what to do
 * @arg: unused
 *
 * Relies heavily on the correct use of the ioctl command conventions.
 *
 * Returns 0 if allowed, error code otherwise
 */
static int smack_file_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	int rc = 0;
	struct smk_audit_info ad;
	struct inode *inode = file_inode(file);

	if (unlikely(IS_PRIVATE(inode)))
		return 0;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
	smk_ad_setfield_u_fs_path(&ad, file->f_path);

	if (_IOC_DIR(cmd) & _IOC_WRITE) {
		rc = smk_curacc(smk_of_inode(inode), MAY_WRITE, &ad);
		rc = smk_bu_file(file, MAY_WRITE, rc);
	}

	if (rc == 0 && (_IOC_DIR(cmd) & _IOC_READ)) {
		rc = smk_curacc(smk_of_inode(inode), MAY_READ, &ad);
		rc = smk_bu_file(file, MAY_READ, rc);
	}

	return rc;
}

/**
 * smack_file_lock - Smack check on file locking
 * @file: the object
 * @cmd: unused
 *
 * Returns 0 if current has lock access, error code otherwise
 */
static int smack_file_lock(struct file *file, unsigned int cmd)
{
	struct smk_audit_info ad;
	int rc;
	struct inode *inode = file_inode(file);

	if (unlikely(IS_PRIVATE(inode)))
		return 0;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
	smk_ad_setfield_u_fs_path(&ad, file->f_path);
	rc = smk_curacc(smk_of_inode(inode), MAY_LOCK, &ad);
	rc = smk_bu_file(file, MAY_LOCK, rc);
	return rc;
}

/**
 * smack_file_fcntl - Smack check on fcntl
 * @file: the object
 * @cmd: what action to check
 * @arg: unused
 *
 * Generally these operations are harmless.
 * File locking operations present an obvious mechanism
 * for passing information, so they require write access.
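 * Only F_SETLK, F_SETLKW, F_SETOWN and F_SETSIG are checked below;
 * everything else, including F_GETLK, is allowed to pass.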
 *
 * Returns 0 if current has access, error code otherwise
 */
static int smack_file_fcntl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	struct smk_audit_info ad;
	int rc = 0;
	struct inode *inode = file_inode(file);

	if (unlikely(IS_PRIVATE(inode)))
		return 0;

	switch (cmd) {
	case F_GETLK:
		break;
	case F_SETLK:
	case F_SETLKW:
		smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
		smk_ad_setfield_u_fs_path(&ad, file->f_path);
		rc = smk_curacc(smk_of_inode(inode), MAY_LOCK, &ad);
		rc = smk_bu_file(file, MAY_LOCK, rc);
		break;
	case F_SETOWN:
	case F_SETSIG:
		smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
		smk_ad_setfield_u_fs_path(&ad, file->f_path);
		rc = smk_curacc(smk_of_inode(inode), MAY_WRITE, &ad);
		rc = smk_bu_file(file, MAY_WRITE, rc);
		break;
	default:
		break;
	}

	return rc;
}

/**
 * smack_mmap_file - Check permissions for a mmap operation.
 * @file: contains the file structure for file to map (may be NULL).
 * @reqprot: contains the protection requested by the application.
 * @prot: contains the protection that will be applied by the kernel.
 * @flags: contains the operational flags.
 *
 * The @file may be NULL, e.g. if mapping anonymous memory.
 *
 * Return 0 if permission is granted.
 */
static int smack_mmap_file(struct file *file,
			   unsigned long reqprot, unsigned long prot,
			   unsigned long flags)
{
	struct smack_known *skp;
	struct smack_known *mkp;
	struct smack_rule *srp;
	struct task_smack *tsp;
	struct smack_known *okp;
	struct inode_smack *isp;
	struct superblock_smack *sbsp;
	int may;
	int mmay;
	int tmay;
	int rc;

	if (file == NULL)
		return 0;

	if (unlikely(IS_PRIVATE(file_inode(file))))
		return 0;

	isp = smack_inode(file_inode(file));
	if (isp->smk_mmap == NULL)
		return 0;
	sbsp = smack_superblock(file_inode(file)->i_sb);
	if (sbsp->smk_flags & SMK_SB_UNTRUSTED &&
	    isp->smk_mmap != sbsp->smk_root)
		return -EACCES;
	mkp = isp->smk_mmap;

	tsp = smack_cred(current_cred());
	skp = smk_of_current();
	rc = 0;

	rcu_read_lock();
	/*
	 * For each Smack rule associated with the subject
	 * label verify that the SMACK64MMAP also has access
	 * to that rule's object label.
	 */
	list_for_each_entry_rcu(srp, &skp->smk_rules, list) {
		okp = srp->smk_object;
		/*
		 * Matching labels always allows access.
		 */
		if (mkp->smk_known == okp->smk_known)
			continue;
		/*
		 * If there is a matching local rule take
		 * that into account as well.
		 */
		may = smk_access_entry(srp->smk_subject->smk_known,
				       okp->smk_known,
				       &tsp->smk_rules);
		if (may == -ENOENT)
			may = srp->smk_access;
		else
			may &= srp->smk_access;
		/*
		 * If may is zero the SMACK64MMAP subject can't
		 * possibly have less access.
		 */
		if (may == 0)
			continue;

		/*
		 * Fetch the global list entry.
		 * If there isn't one a SMACK64MMAP subject
		 * can't have as much access as current.
		 */
		mmay = smk_access_entry(mkp->smk_known, okp->smk_known,
					&mkp->smk_rules);
		if (mmay == -ENOENT) {
			rc = -EACCES;
			break;
		}
		/*
		 * If there is a local entry it modifies the
		 * potential access, too.
		 */
		tmay = smk_access_entry(mkp->smk_known, okp->smk_known,
					&tsp->smk_rules);
		if (tmay != -ENOENT)
			mmay &= tmay;

		/*
		 * If there is any access available to current that is
		 * not available to a SMACK64MMAP subject
		 * deny access.
		 */
		if ((may | mmay) != mmay) {
			rc = -EACCES;
			break;
		}
	}

	rcu_read_unlock();

	return rc;
}

/**
 * smack_file_set_fowner - set the file security blob value
 * @file: object in question
 *
 */
static void smack_file_set_fowner(struct file *file)
{
	struct smack_known **blob = smack_file(file);

	*blob = smk_of_current();
}

/**
 * smack_file_send_sigiotask - Smack on sigio
 * @tsk: The target task
 * @fown: the object the signal comes from
 * @signum: unused
 *
 * Allow a privileged task to get signals even if it shouldn't
 *
 * Returns 0 if a subject with the object's smack could
 * write to the task, an error code otherwise.
 */
static int smack_file_send_sigiotask(struct task_struct *tsk,
				     struct fown_struct *fown, int signum)
{
	struct smack_known **blob;
	struct smack_known *skp;
	struct smack_known *tkp = smk_of_task(smack_cred(tsk->cred));
	const struct cred *tcred;
	struct file *file;
	int rc;
	struct smk_audit_info ad;

	/*
	 * struct fown_struct is never outside the context of a struct file
	 */
	file = fown->file;

	/* we don't log here as rc can be overridden */
	blob = smack_file(file);
	skp = *blob;
	rc = smk_access(skp, tkp, MAY_DELIVER, NULL);
	rc = smk_bu_note("sigiotask", skp, tkp, MAY_DELIVER, rc);

	rcu_read_lock();
	tcred = __task_cred(tsk);
	if (rc != 0 && smack_privileged_cred(CAP_MAC_OVERRIDE, tcred))
		rc = 0;
	rcu_read_unlock();

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
	smk_ad_setfield_u_tsk(&ad, tsk);
	smack_log(skp->smk_known, tkp->smk_known, MAY_DELIVER, rc, &ad);
	return rc;
}

/**
 * smack_file_receive - Smack file receive check
 * @file: the object
 *
 * Returns 0 if current has access, error code otherwise
 */
static int smack_file_receive(struct file *file)
{
	int rc;
	int may = 0;
	struct smk_audit_info ad;
	struct inode *inode = file_inode(file);
	struct socket *sock;
	struct task_smack *tsp;
	struct socket_smack *ssp;

	if (unlikely(IS_PRIVATE(inode)))
		return 0;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
	smk_ad_setfield_u_fs_path(&ad, file->f_path);

	if (inode->i_sb->s_magic == SOCKFS_MAGIC) {
		sock = SOCKET_I(inode);
		ssp = smack_sock(sock->sk);
		tsp = smack_cred(current_cred());
		/*
		 * If the receiving process can't write to the
		 * passed socket or if the passed socket can't
		 * write to the receiving process don't accept
		 * the passed socket.
		 */
		rc = smk_access(tsp->smk_task, ssp->smk_out, MAY_WRITE, &ad);
		rc = smk_bu_file(file, may, rc);
		if (rc < 0)
			return rc;
		rc = smk_access(ssp->smk_in, tsp->smk_task, MAY_WRITE, &ad);
		rc = smk_bu_file(file, may, rc);
		return rc;
	}
	/*
	 * This code relies on bitmasks.
	 */
	if (file->f_mode & FMODE_READ)
		may = MAY_READ;
	if (file->f_mode & FMODE_WRITE)
		may |= MAY_WRITE;

	rc = smk_curacc(smk_of_inode(inode), may, &ad);
	rc = smk_bu_file(file, may, rc);
	return rc;
}

/**
 * smack_file_open - Smack dentry open processing
 * @file: the object
 *
 * Set the security blob in the file structure.
 * Allow the open only if the task has read access. There are
 * many read operations (e.g. fstat) that you can do with an
 * fd even if you have the file open write-only.
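 *
 * The check is made against file->f_cred, the credentials of the
 * opener, rather than those of current.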
 *
 * Returns 0 if current has access, error code otherwise
 */
static int smack_file_open(struct file *file)
{
	struct task_smack *tsp = smack_cred(file->f_cred);
	struct inode *inode = file_inode(file);
	struct smk_audit_info ad;
	int rc;

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
	smk_ad_setfield_u_fs_path(&ad, file->f_path);
	rc = smk_tskacc(tsp, smk_of_inode(inode), MAY_READ, &ad);
	rc = smk_bu_credfile(file->f_cred, file, MAY_READ, rc);

	return rc;
}

/*
 * Task hooks
 */

/**
 * smack_cred_alloc_blank - "allocate" blank task-level security credentials
 * @cred: the new credentials
 * @gfp: the atomicity of any memory allocations
 *
 * Prepare a blank set of credentials for modification. This must allocate all
 * the memory the LSM module might require such that cred_transfer() can
 * complete without error.
 */
static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp)
{
	init_task_smack(smack_cred(cred), NULL, NULL);
	return 0;
}

/**
 * smack_cred_free - "free" task-level security credentials
 * @cred: the credentials in question
 *
 */
static void smack_cred_free(struct cred *cred)
{
	struct task_smack *tsp = smack_cred(cred);
	struct smack_rule *rp;
	struct list_head *l;
	struct list_head *n;

	smk_destroy_label_list(&tsp->smk_relabel);

	list_for_each_safe(l, n, &tsp->smk_rules) {
		rp = list_entry(l, struct smack_rule, list);
		list_del(&rp->list);
		kmem_cache_free(smack_rule_cache, rp);
	}
}

/**
 * smack_cred_prepare - prepare new set of credentials for modification
 * @new: the new credentials
 * @old: the original credentials
 * @gfp: the atomicity of any memory allocations
 *
 * Prepare a new set of credentials for modification.
 */
static int smack_cred_prepare(struct cred *new, const struct cred *old,
			      gfp_t gfp)
{
	struct task_smack *old_tsp = smack_cred(old);
	struct task_smack *new_tsp = smack_cred(new);
	int rc;

	init_task_smack(new_tsp, old_tsp->smk_task, old_tsp->smk_task);

	rc = smk_copy_rules(&new_tsp->smk_rules, &old_tsp->smk_rules, gfp);
	if (rc != 0)
		return rc;

	rc = smk_copy_relabel(&new_tsp->smk_relabel, &old_tsp->smk_relabel,
				gfp);
	return rc;
}

/**
 * smack_cred_transfer - Transfer the old credentials to the new credentials
 * @new: the new credentials
 * @old: the original credentials
 *
 * Fill in a set of blank credentials from another set of credentials.
 */
static void smack_cred_transfer(struct cred *new, const struct cred *old)
{
	struct task_smack *old_tsp = smack_cred(old);
	struct task_smack *new_tsp = smack_cred(new);

	init_task_smack(new_tsp, old_tsp->smk_task, old_tsp->smk_task);
}

/**
 * smack_cred_getsecid - get the secid corresponding to a creds structure
 * @cred: the object creds
 * @secid: where to put the result
 *
 * Sets the secid to contain a u32 version of the smack label.
 */
static void smack_cred_getsecid(const struct cred *cred, u32 *secid)
{
	struct smack_known *skp;

	rcu_read_lock();
	skp = smk_of_task(smack_cred(cred));
	*secid = skp->smk_secid;
	rcu_read_unlock();
}

/**
 * smack_cred_getlsmprop - get the Smack label for a creds structure
 * @cred: the object creds
 * @prop: where to put the data
 *
 * Sets the Smack part of the ref
 */
static void smack_cred_getlsmprop(const struct cred *cred,
				  struct lsm_prop *prop)
{
	rcu_read_lock();
	prop->smack.skp = smk_of_task(smack_cred(cred));
	rcu_read_unlock();
}

/**
 * smack_kernel_act_as - Set the subjective context in a set of credentials
 * @new: points to the set of credentials to be modified.
 * @secid: specifies the security ID to be set
 *
 * Set the security data for a kernel service.
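 *
 * smack_from_secid() maps @secid back to its smack_known entry; an
 * unrecognized secid falls back to the "?" (huh) label.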
 */
static int smack_kernel_act_as(struct cred *new, u32 secid)
{
	struct task_smack *new_tsp = smack_cred(new);

	new_tsp->smk_task = smack_from_secid(secid);
	return 0;
}

/**
 * smack_kernel_create_files_as - Set the file creation label in a set of creds
 * @new: points to the set of credentials to be modified
 * @inode: points to the inode to use as a reference
 *
 * Set the file creation context in a set of credentials to the same
 * as the objective context of the specified inode
 */
static int smack_kernel_create_files_as(struct cred *new,
					struct inode *inode)
{
	struct inode_smack *isp = smack_inode(inode);
	struct task_smack *tsp = smack_cred(new);

	tsp->smk_forked = isp->smk_inode;
	tsp->smk_task = tsp->smk_forked;
	return 0;
}

/**
 * smk_curacc_on_task - helper to log task related access
 * @p: the task object
 * @access: the access requested
 * @caller: name of the calling function for audit
 *
 * Return 0 if access is permitted
 */
static int smk_curacc_on_task(struct task_struct *p, int access,
			      const char *caller)
{
	struct smk_audit_info ad;
	struct smack_known *skp = smk_of_task_struct_obj(p);
	int rc;

	smk_ad_init(&ad, caller, LSM_AUDIT_DATA_TASK);
	smk_ad_setfield_u_tsk(&ad, p);
	rc = smk_curacc(skp, access, &ad);
	rc = smk_bu_task(p, access, rc);
	return rc;
}

/**
 * smack_task_setpgid - Smack check on setting pgid
 * @p: the task object
 * @pgid: unused
 *
 * Return 0 if write access is permitted
 */
static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
{
	return smk_curacc_on_task(p, MAY_WRITE, __func__);
}

/**
 * smack_task_getpgid - Smack access check for getpgid
 * @p: the object task
 *
 * Returns 0 if current can read the object task, error code otherwise
 */
static int smack_task_getpgid(struct task_struct *p)
{
	return smk_curacc_on_task(p, MAY_READ, __func__);
}

/**
 * smack_task_getsid - Smack access check for getsid
 * @p: the object task
 *
 * Returns 0 if current can read the object task, error code otherwise
 */
static int smack_task_getsid(struct task_struct *p)
{
	return smk_curacc_on_task(p, MAY_READ, __func__);
}

/**
 * smack_current_getlsmprop_subj - get the subjective label of the current task
 * @prop: where to put the result
 *
 * Sets @prop to the task's subjective smack label.
 */
static void smack_current_getlsmprop_subj(struct lsm_prop *prop)
{
	prop->smack.skp = smk_of_current();
}

/**
 * smack_task_getlsmprop_obj - get the objective data of the task
 * @p: the task
 * @prop: where to put the result
 *
 * Sets @prop to the task's objective smack label.
 */
static void smack_task_getlsmprop_obj(struct task_struct *p,
				      struct lsm_prop *prop)
{
	prop->smack.skp = smk_of_task_struct_obj(p);
}

/**
 * smack_task_setnice - Smack check on setting nice
 * @p: the task object
 * @nice: unused
 *
 * Return 0 if write access is permitted
 */
static int smack_task_setnice(struct task_struct *p, int nice)
{
	return smk_curacc_on_task(p, MAY_WRITE, __func__);
}

/**
 * smack_task_setioprio - Smack check on setting ioprio
 * @p: the task object
 * @ioprio: unused
 *
 * Return 0 if write access is permitted
 */
static int smack_task_setioprio(struct task_struct *p, int ioprio)
{
	return smk_curacc_on_task(p, MAY_WRITE, __func__);
}

/**
 * smack_task_getioprio - Smack check on reading ioprio
 * @p: the task object
 *
 * Return 0 if read access is permitted
 */
static int smack_task_getioprio(struct task_struct *p)
{
	return smk_curacc_on_task(p, MAY_READ, __func__);
}

/**
 * smack_task_setscheduler - Smack check on setting scheduler
 * @p: the task object
 *
 * Return 0 if write access is permitted
 */
static int smack_task_setscheduler(struct task_struct *p)
{
	return smk_curacc_on_task(p, MAY_WRITE, __func__);
}

/**
 * smack_task_getscheduler - Smack check on reading scheduler
 * @p: the task object
 *
 * Return 0 if read access is permitted
 */
static int smack_task_getscheduler(struct task_struct *p)
{
	return smk_curacc_on_task(p, MAY_READ, __func__);
}

/**
 * smack_task_movememory - Smack check on moving memory
 * @p: the task object
 *
 * Return 0 if write access is permitted
 */
static int smack_task_movememory(struct task_struct *p)
{
	return smk_curacc_on_task(p, MAY_WRITE, __func__);
}

/**
 * smack_task_kill - Smack check on signal delivery
 * @p: the task object
 * @info: unused
 * @sig: unused
 * @cred: identifies the cred to use in lieu of current's
 *
 * Return 0 if write access is permitted
 *
 */
static int smack_task_kill(struct task_struct *p, struct kernel_siginfo *info,
			   int sig, const struct cred *cred)
{
	struct smk_audit_info ad;
	struct smack_known *skp;
	struct smack_known *tkp = smk_of_task_struct_obj(p);
	int rc;

	if (!sig)
		return 0; /* null signal; existence test */

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
	smk_ad_setfield_u_tsk(&ad, p);
	/*
	 * Sending a signal requires that the sender
	 * can write the receiver.
	 */
	if (cred == NULL) {
		rc = smk_curacc(tkp, MAY_DELIVER, &ad);
		rc = smk_bu_task(p, MAY_DELIVER, rc);
		return rc;
	}
	/*
	 * If the cred isn't NULL we're dealing with some USB IO
	 * specific behavior. This is not clean. For one thing
	 * we can't take privilege into account.
	 */
	skp = smk_of_task(smack_cred(cred));
	rc = smk_access(skp, tkp, MAY_DELIVER, &ad);
	rc = smk_bu_note("USB signal", skp, tkp, MAY_DELIVER, rc);
	return rc;
}

/**
 * smack_task_to_inode - copy task smack into the inode blob
 * @p: task to copy from
 * @inode: inode to copy to
 *
 * Sets the smack pointer in the inode security blob
 */
static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
{
	struct inode_smack *isp = smack_inode(inode);
	struct smack_known *skp = smk_of_task_struct_obj(p);

	isp->smk_inode = skp;
	isp->smk_flags |= SMK_INODE_INSTANT;
}

/*
 * Socket hooks.
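 *
 * A socket carries three labels: smk_in is the object label used when
 * checking deliveries to the socket, smk_out is the subject label used
 * for what the socket sends, and smk_packet remembers the label of the
 * most recent peer for SO_PEERSEC.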
 */

/**
 * smack_sk_alloc_security - Allocate a socket blob
 * @sk: the socket
 * @family: unused
 * @gfp_flags: memory allocation flags
 *
 * Assign Smack pointers to current
 *
 * Returns 0 on success, -ENOMEM if there's no memory
 */
static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags)
{
	struct smack_known *skp = smk_of_current();
	struct socket_smack *ssp = smack_sock(sk);

	/*
	 * Sockets created by kernel threads receive web label.
	 */
	if (unlikely(current->flags & PF_KTHREAD)) {
		ssp->smk_in = &smack_known_web;
		ssp->smk_out = &smack_known_web;
	} else {
		ssp->smk_in = skp;
		ssp->smk_out = skp;
	}
	ssp->smk_packet = NULL;

	return 0;
}

#ifdef SMACK_IPV6_PORT_LABELING
/**
 * smack_sk_free_security - Free a socket blob
 * @sk: the socket
 *
 * Clears the blob pointer
 */
static void smack_sk_free_security(struct sock *sk)
{
	struct smk_port_label *spp;

	if (sk->sk_family == PF_INET6) {
		rcu_read_lock();
		list_for_each_entry_rcu(spp, &smk_ipv6_port_list, list) {
			if (spp->smk_sock != sk)
				continue;
			spp->smk_can_reuse = 1;
			break;
		}
		rcu_read_unlock();
	}
}
#endif

/**
 * smack_sk_clone_security - Copy security context
 * @sk: the old socket
 * @newsk: the new socket
 *
 * Copy the security context of the old socket pointer to the cloned
 */
static void smack_sk_clone_security(const struct sock *sk, struct sock *newsk)
{
	struct socket_smack *ssp_old = smack_sock(sk);
	struct socket_smack *ssp_new = smack_sock(newsk);

	*ssp_new = *ssp_old;
}

/**
 * smack_ipv4host_label - check host based restrictions
 * @sip: the object end
 *
 * looks for host based access restrictions
 *
 * This version will only be appropriate for really small sets of single label
 * hosts. The caller is responsible for ensuring that the RCU read lock is
 * taken before calling this function.
 *
 * Returns the label of the far end or NULL if it's not special.
 */
static struct smack_known *smack_ipv4host_label(struct sockaddr_in *sip)
{
	struct smk_net4addr *snp;
	struct in_addr *siap = &sip->sin_addr;

	if (siap->s_addr == 0)
		return NULL;

	list_for_each_entry_rcu(snp, &smk_net4addr_list, list)
		/*
		 * we break after finding the first match because
		 * the list is sorted from longest to shortest mask
		 * so we have found the most specific match
		 */
		if (snp->smk_host.s_addr ==
		    (siap->s_addr & snp->smk_mask.s_addr))
			return snp->smk_label;

	return NULL;
}

/*
 * smk_ipv6_localhost - Check for local ipv6 host address
 * @sip: the address
 *
 * Returns boolean true if this is the localhost address
 */
static bool smk_ipv6_localhost(struct sockaddr_in6 *sip)
{
	__be16 *be16p = (__be16 *)&sip->sin6_addr;
	__be32 *be32p = (__be32 *)&sip->sin6_addr;

	if (be32p[0] == 0 && be32p[1] == 0 && be32p[2] == 0 &&
	    be16p[6] == 0 && ntohs(be16p[7]) == 1)
		return true;
	return false;
}

/**
 * smack_ipv6host_label - check host based restrictions
 * @sip: the object end
 *
 * looks for host based access restrictions
 *
 * This version will only be appropriate for really small sets of single label
 * hosts. The caller is responsible for ensuring that the RCU read lock is
 * taken before calling this function.
 *
 * Returns the label of the far end or NULL if it's not special.
 */
static struct smack_known *smack_ipv6host_label(struct sockaddr_in6 *sip)
{
	struct smk_net6addr *snp;
	struct in6_addr *sap = &sip->sin6_addr;
	int i;
	int found = 0;

	/*
	 * It's local. Don't look for a host label.
	 */
	if (smk_ipv6_localhost(sip))
		return NULL;

	list_for_each_entry_rcu(snp, &smk_net6addr_list, list) {
		/*
		 * If the label is NULL the entry has
		 * been renounced. Ignore it.
		 */
		if (snp->smk_label == NULL)
			continue;
		/*
		 * we break after finding the first match because
		 * the list is sorted from longest to shortest mask
		 * so we have found the most specific match
		 */
		for (found = 1, i = 0; i < 8; i++) {
			if ((sap->s6_addr16[i] & snp->smk_mask.s6_addr16[i]) !=
			    snp->smk_host.s6_addr16[i]) {
				found = 0;
				break;
			}
		}
		if (found)
			return snp->smk_label;
	}

	return NULL;
}

/**
 * smack_netlbl_add - Set the secattr on a socket
 * @sk: the socket
 *
 * Attach the outbound smack value (smk_out) to the socket.
 *
 * Returns 0 on success or an error code
 */
static int smack_netlbl_add(struct sock *sk)
{
	struct socket_smack *ssp = smack_sock(sk);
	struct smack_known *skp = ssp->smk_out;
	int rc;

	local_bh_disable();
	bh_lock_sock_nested(sk);

	rc = netlbl_sock_setattr(sk, sk->sk_family, &skp->smk_netlabel,
				 netlbl_sk_lock_check(sk));
	switch (rc) {
	case 0:
		ssp->smk_state = SMK_NETLBL_LABELED;
		break;
	case -EDESTADDRREQ:
		ssp->smk_state = SMK_NETLBL_REQSKB;
		rc = 0;
		break;
	}

	bh_unlock_sock(sk);
	local_bh_enable();

	return rc;
}

/**
 * smack_netlbl_delete - Remove the secattr from a socket
 * @sk: the socket
 *
 * Remove the outbound smack value from a socket
 */
static void smack_netlbl_delete(struct sock *sk)
{
	struct socket_smack *ssp = smack_sock(sk);

	/*
	 * Take the label off the socket if one is set.
	 */
	if (ssp->smk_state != SMK_NETLBL_LABELED)
		return;

	local_bh_disable();
	bh_lock_sock_nested(sk);
	netlbl_sock_delattr(sk);
	bh_unlock_sock(sk);
	local_bh_enable();
	ssp->smk_state = SMK_NETLBL_UNLABELED;
}

/**
 * smk_ipv4_check - Perform IPv4 host access checks
 * @sk: the socket
 * @sap: the destination address
 *
 * Set the correct secattr for the given socket based on the destination
 * address and perform any outbound access checks needed.
 *
 * Returns 0 on success or an error code.
 *
 */
static int smk_ipv4_check(struct sock *sk, struct sockaddr_in *sap)
{
	struct smack_known *skp;
	int rc = 0;
	struct smack_known *hkp;
	struct socket_smack *ssp = smack_sock(sk);
	struct smk_audit_info ad;

	rcu_read_lock();
	hkp = smack_ipv4host_label(sap);
	if (hkp != NULL) {
#ifdef CONFIG_AUDIT
		struct lsm_network_audit net;

		smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
		ad.a.u.net->family = sap->sin_family;
		ad.a.u.net->dport = sap->sin_port;
		ad.a.u.net->v4info.daddr = sap->sin_addr.s_addr;
#endif
		skp = ssp->smk_out;
		rc = smk_access(skp, hkp, MAY_WRITE, &ad);
		rc = smk_bu_note("IPv4 host check", skp, hkp, MAY_WRITE, rc);
		/*
		 * Clear the socket netlabel if it's set.
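		 * A single label host is expected to speak plain,
		 * unlabeled IP, so a CIPSO option on packets bound
		 * for it would be unwelcome.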
		 */
		if (!rc)
			smack_netlbl_delete(sk);
	}
	rcu_read_unlock();

	return rc;
}

/**
 * smk_ipv6_check - check Smack access
 * @subject: subject Smack label
 * @object: object Smack label
 * @address: address
 * @act: the action being taken
 *
 * Check an IPv6 access
 */
static int smk_ipv6_check(struct smack_known *subject,
			  struct smack_known *object,
			  struct sockaddr_in6 *address, int act)
{
#ifdef CONFIG_AUDIT
	struct lsm_network_audit net;
#endif
	struct smk_audit_info ad;
	int rc;

#ifdef CONFIG_AUDIT
	smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
	ad.a.u.net->family = PF_INET6;
	ad.a.u.net->dport = address->sin6_port;
	if (act == SMK_RECEIVING)
		ad.a.u.net->v6info.saddr = address->sin6_addr;
	else
		ad.a.u.net->v6info.daddr = address->sin6_addr;
#endif
	rc = smk_access(subject, object, MAY_WRITE, &ad);
	rc = smk_bu_note("IPv6 check", subject, object, MAY_WRITE, rc);
	return rc;
}

#ifdef SMACK_IPV6_PORT_LABELING
/**
 * smk_ipv6_port_label - Smack port access table management
 * @sock: socket
 * @address: address
 *
 * Create or update the port list entry
 */
static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address)
{
	struct sock *sk = sock->sk;
	struct sockaddr_in6 *addr6;
	struct socket_smack *ssp = smack_sock(sock->sk);
	struct smk_port_label *spp;
	unsigned short port = 0;

	if (address == NULL) {
		/*
		 * This operation is changing the Smack information
		 * on the bound socket. Take the changes to the port
		 * as well.
		 */
		rcu_read_lock();
		list_for_each_entry_rcu(spp, &smk_ipv6_port_list, list) {
			if (sk != spp->smk_sock)
				continue;
			spp->smk_in = ssp->smk_in;
			spp->smk_out = ssp->smk_out;
			rcu_read_unlock();
			return;
		}
		/*
		 * A NULL address is only used for updating existing
		 * bound entries. If there isn't one, it's OK.
		 */
		rcu_read_unlock();
		return;
	}

	addr6 = (struct sockaddr_in6 *)address;
	port = ntohs(addr6->sin6_port);
	/*
	 * This is a special case that is safely ignored.
	 */
	if (port == 0)
		return;

	/*
	 * Look for an existing port list entry.
	 * This is an indication that a port is getting reused.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(spp, &smk_ipv6_port_list, list) {
		if (spp->smk_port != port || spp->smk_sock_type != sock->type)
			continue;
		if (spp->smk_can_reuse != 1) {
			rcu_read_unlock();
			return;
		}
		spp->smk_port = port;
		spp->smk_sock = sk;
		spp->smk_in = ssp->smk_in;
		spp->smk_out = ssp->smk_out;
		spp->smk_can_reuse = 0;
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();
	/*
	 * A new port entry is required.
	 */
	spp = kzalloc(sizeof(*spp), GFP_KERNEL);
	if (spp == NULL)
		return;

	spp->smk_port = port;
	spp->smk_sock = sk;
	spp->smk_in = ssp->smk_in;
	spp->smk_out = ssp->smk_out;
	spp->smk_sock_type = sock->type;
	spp->smk_can_reuse = 0;

	mutex_lock(&smack_ipv6_lock);
	list_add_rcu(&spp->list, &smk_ipv6_port_list);
	mutex_unlock(&smack_ipv6_lock);

	return;
}

/**
 * smk_ipv6_port_check - check Smack port access
 * @sk: socket
 * @address: address
 * @act: the action being taken
 *
 * Create or update the port list entry
 */
static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address,
				int act)
{
	struct smk_port_label *spp;
	struct socket_smack *ssp = smack_sock(sk);
	struct smack_known *skp = NULL;
	unsigned short port;
	struct smack_known *object;

	if (act == SMK_RECEIVING) {
		skp = smack_ipv6host_label(address);
		object = ssp->smk_in;
	} else {
		skp = ssp->smk_out;
		object = smack_ipv6host_label(address);
	}

	/*
	 * The other end is a single label host.
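	 * In that case both a subject and an object are in hand and
	 * the check can be made directly, with no port lookup.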
	 */
	if (skp != NULL && object != NULL)
		return smk_ipv6_check(skp, object, address, act);
	if (skp == NULL)
		skp = smack_net_ambient;
	if (object == NULL)
		object = smack_net_ambient;

	/*
	 * It's remote, so port lookup does no good.
	 */
	if (!smk_ipv6_localhost(address))
		return smk_ipv6_check(skp, object, address, act);

	/*
	 * It's local so the send check has to have passed.
	 */
	if (act == SMK_RECEIVING)
		return 0;

	port = ntohs(address->sin6_port);
	rcu_read_lock();
	list_for_each_entry_rcu(spp, &smk_ipv6_port_list, list) {
		if (spp->smk_port != port || spp->smk_sock_type != sk->sk_type)
			continue;
		object = spp->smk_in;
		if (act == SMK_CONNECTING)
			ssp->smk_packet = spp->smk_out;
		break;
	}
	rcu_read_unlock();

	return smk_ipv6_check(skp, object, address, act);
}
#endif

/**
 * smack_inode_setsecurity - set smack xattrs
 * @inode: the object
 * @name: attribute name
 * @value: attribute value
 * @size: size of the attribute
 * @flags: unused
 *
 * Sets the named attribute in the appropriate blob
 *
 * Returns 0 on success, or an error code
 */
static int smack_inode_setsecurity(struct inode *inode, const char *name,
				   const void *value, size_t size, int flags)
{
	struct smack_known *skp;
	struct inode_smack *nsp = smack_inode(inode);
	struct socket_smack *ssp;
	struct socket *sock;
	int rc = 0;

	if (value == NULL || size > SMK_LONGLABEL || size == 0)
		return -EINVAL;

	if (strcmp(name, XATTR_SMACK_TRANSMUTE) == 0) {
		if (!S_ISDIR(inode->i_mode) || size != TRANS_TRUE_SIZE ||
		    strncmp(value, TRANS_TRUE, TRANS_TRUE_SIZE) != 0)
			return -EINVAL;

		nsp->smk_flags |= SMK_INODE_TRANSMUTE;
		return 0;
	}

	skp = smk_import_entry(value, size);
	if (IS_ERR(skp))
		return PTR_ERR(skp);

	if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) {
		nsp->smk_inode = skp;
		nsp->smk_flags |= SMK_INODE_INSTANT;
		return 0;
	}
	/*
	 * The rest of the Smack xattrs are only on sockets.
	 */
	if (inode->i_sb->s_magic != SOCKFS_MAGIC)
		return -EOPNOTSUPP;

	sock = SOCKET_I(inode);
	if (sock == NULL || sock->sk == NULL)
		return -EOPNOTSUPP;

	ssp = smack_sock(sock->sk);

	if (strcmp(name, XATTR_SMACK_IPIN) == 0)
		ssp->smk_in = skp;
	else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) {
		ssp->smk_out = skp;
		if (sock->sk->sk_family == PF_INET) {
			rc = smack_netlbl_add(sock->sk);
			if (rc != 0)
				printk(KERN_WARNING
					"Smack: \"%s\" netlbl error %d.\n",
					__func__, -rc);
		}
	} else
		return -EOPNOTSUPP;

#ifdef SMACK_IPV6_PORT_LABELING
	if (sock->sk->sk_family == PF_INET6)
		smk_ipv6_port_label(sock, NULL);
#endif

	return 0;
}

/**
 * smack_socket_post_create - finish socket setup
 * @sock: the socket
 * @family: protocol family
 * @type: unused
 * @protocol: unused
 * @kern: unused
 *
 * Sets the netlabel information on the socket
 *
 * Returns 0 on success, an error code otherwise
 */
static int smack_socket_post_create(struct socket *sock, int family,
				    int type, int protocol, int kern)
{
	struct socket_smack *ssp;

	if (sock->sk == NULL)
		return 0;

	/*
	 * Sockets created by kernel threads receive web label.
	 */
	if (unlikely(current->flags & PF_KTHREAD)) {
		ssp = smack_sock(sock->sk);
		ssp->smk_in = &smack_known_web;
		ssp->smk_out = &smack_known_web;
	}

	if (family != PF_INET)
		return 0;
	/*
	 * Set the outbound netlbl.
	 */
	return smack_netlbl_add(sock->sk);
}

/**
 * smack_socket_socketpair - create socket pair
 * @socka: one socket
 * @sockb: another socket
 *
 * Cross reference the peer labels for SO_PEERSEC
 *
 * Returns 0
 */
static int smack_socket_socketpair(struct socket *socka,
				   struct socket *sockb)
{
	struct socket_smack *asp = smack_sock(socka->sk);
	struct socket_smack *bsp = smack_sock(sockb->sk);

	asp->smk_packet = bsp->smk_out;
	bsp->smk_packet = asp->smk_out;

	return 0;
}

#ifdef SMACK_IPV6_PORT_LABELING
/**
 * smack_socket_bind - record port binding information.
 * @sock: the socket
 * @address: the port address
 * @addrlen: size of the address
 *
 * Records the label bound to a port.
 *
 * Returns 0 on success, an error code otherwise
 */
static int smack_socket_bind(struct socket *sock, struct sockaddr *address,
				int addrlen)
{
	if (sock->sk != NULL && sock->sk->sk_family == PF_INET6) {
		if (addrlen < SIN6_LEN_RFC2133 ||
		    address->sa_family != AF_INET6)
			return -EINVAL;
		smk_ipv6_port_label(sock, address);
	}
	return 0;
}
#endif /* SMACK_IPV6_PORT_LABELING */

/**
 * smack_socket_connect - connect access check
 * @sock: the socket
 * @sap: the other end
 * @addrlen: size of sap
 *
 * Verifies that a connection may be possible
 *
 * Returns 0 on success, an error code otherwise
 */
static int smack_socket_connect(struct socket *sock, struct sockaddr *sap,
				int addrlen)
{
	int rc = 0;

	if (sock->sk == NULL)
		return 0;
	if (sock->sk->sk_family != PF_INET &&
	    (!IS_ENABLED(CONFIG_IPV6) || sock->sk->sk_family != PF_INET6))
		return 0;
	if (addrlen < offsetofend(struct sockaddr, sa_family))
		return 0;
	if (IS_ENABLED(CONFIG_IPV6) && sap->sa_family == AF_INET6) {
		struct sockaddr_in6 *sip = (struct sockaddr_in6 *)sap;
		struct smack_known *rsp = NULL;

		if (addrlen < SIN6_LEN_RFC2133)
			return 0;
		if (__is_defined(SMACK_IPV6_SECMARK_LABELING))
			rsp = smack_ipv6host_label(sip);
		if (rsp != NULL) {
			struct socket_smack *ssp = smack_sock(sock->sk);

			rc = smk_ipv6_check(ssp->smk_out, rsp, sip,
					    SMK_CONNECTING);
		}
#ifdef SMACK_IPV6_PORT_LABELING
		rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING);
#endif

		return rc;
	}
	if (sap->sa_family != AF_INET || addrlen < sizeof(struct sockaddr_in))
		return 0;
	rc = smk_ipv4_check(sock->sk, (struct sockaddr_in *)sap);
	return rc;
}

/**
 * smack_flags_to_may - convert S_ to MAY_ values
 * @flags: the S_ value
 *
 * Returns the equivalent MAY_ value
 */
static int smack_flags_to_may(int flags)
{
	int may = 0;

	if (flags & S_IRUGO)
		may |= MAY_READ;
	if (flags & S_IWUGO)
		may |= MAY_WRITE;
	if (flags & S_IXUGO)
		may |= MAY_EXEC;

	return may;
}
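/*
 * For example, an ipc flag of 0644 has bits set in both S_IRUGO (0444)
 * and S_IWUGO (0222), so it maps to MAY_READ | MAY_WRITE.
 */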
/**
 * smack_msg_msg_alloc_security - Set the security blob for msg_msg
 * @msg: the object
 *
 * Returns 0
 */
static int smack_msg_msg_alloc_security(struct msg_msg *msg)
{
	struct smack_known **blob = smack_msg_msg(msg);

	*blob = smk_of_current();
	return 0;
}

/**
 * smack_of_ipc - the smack pointer for the ipc
 * @isp: the object
 *
 * Returns a pointer to the smack value
 */
static struct smack_known *smack_of_ipc(struct kern_ipc_perm *isp)
{
	struct smack_known **blob = smack_ipc(isp);

	return *blob;
}

/**
 * smack_ipc_alloc_security - Set the security blob for ipc
 * @isp: the object
 *
 * Returns 0
 */
static int smack_ipc_alloc_security(struct kern_ipc_perm *isp)
{
	struct smack_known **blob = smack_ipc(isp);

	*blob = smk_of_current();
	return 0;
}

/**
 * smk_curacc_shm : check if current has access on shm
 * @isp : the object
 * @access : access requested
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smk_curacc_shm(struct kern_ipc_perm *isp, int access)
{
	struct smack_known *ssp = smack_of_ipc(isp);
	struct smk_audit_info ad;
	int rc;

#ifdef CONFIG_AUDIT
	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
	ad.a.u.ipc_id = isp->id;
#endif
	rc = smk_curacc(ssp, access, &ad);
	rc = smk_bu_current("shm", ssp, access, rc);
	return rc;
}

/**
 * smack_shm_associate - Smack access check for shm
 * @isp: the object
 * @shmflg: access requested
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_shm_associate(struct kern_ipc_perm *isp, int shmflg)
{
	int may;

	may = smack_flags_to_may(shmflg);
	return smk_curacc_shm(isp, may);
}

/**
 * smack_shm_shmctl - Smack access check for shm
 * @isp: the object
 * @cmd: what it wants to do
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_shm_shmctl(struct kern_ipc_perm *isp, int cmd)
{
	int may;

	switch (cmd) {
	case IPC_STAT:
	case SHM_STAT:
	case SHM_STAT_ANY:
		may = MAY_READ;
		break;
	case IPC_SET:
	case SHM_LOCK:
	case SHM_UNLOCK:
	case IPC_RMID:
		may = MAY_READWRITE;
		break;
	case IPC_INFO:
	case SHM_INFO:
		/*
		 * System level information.
		 */
		return 0;
	default:
		return -EINVAL;
	}
	return smk_curacc_shm(isp, may);
}

/**
 * smack_shm_shmat - Smack access for shmat
 * @isp: the object
 * @shmaddr: unused
 * @shmflg: access requested
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_shm_shmat(struct kern_ipc_perm *isp, char __user *shmaddr,
			   int shmflg)
{
	int may;

	may = smack_flags_to_may(shmflg);
	return smk_curacc_shm(isp, may);
}

/**
 * smk_curacc_sem : check if current has access on sem
 * @isp : the object
 * @access : access requested
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smk_curacc_sem(struct kern_ipc_perm *isp, int access)
{
	struct smack_known *ssp = smack_of_ipc(isp);
	struct smk_audit_info ad;
	int rc;

#ifdef CONFIG_AUDIT
	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
	ad.a.u.ipc_id = isp->id;
#endif
	rc = smk_curacc(ssp, access, &ad);
	rc = smk_bu_current("sem", ssp, access, rc);
	return rc;
}

/**
 * smack_sem_associate - Smack access check for sem
 * @isp: the object
 * @semflg: access requested
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_sem_associate(struct kern_ipc_perm *isp, int semflg)
{
	int may;

	may = smack_flags_to_may(semflg);
	return smk_curacc_sem(isp, may);
}

/**
 * smack_sem_semctl - Smack access check for sem
 * @isp: the object
 * @cmd: what it wants to do
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_sem_semctl(struct kern_ipc_perm *isp, int cmd)
{
	int may;

	switch (cmd) {
	case GETPID:
	case GETNCNT:
	case GETZCNT:
	case GETVAL:
	case GETALL:
	case IPC_STAT:
	case SEM_STAT:
	case SEM_STAT_ANY:
		may = MAY_READ;
		break;
	case SETVAL:
	case SETALL:
	case IPC_RMID:
	case IPC_SET:
		may = MAY_READWRITE;
		break;
	case IPC_INFO:
	case SEM_INFO:
		/*
		 * System level information
		 */
		return 0;
	default:
		return -EINVAL;
	}

	return smk_curacc_sem(isp, may);
}

/**
 * smack_sem_semop - Smack checks of semaphore operations
 * @isp: the object
 * @sops: unused
 * @nsops: unused
 * @alter: unused
 *
 * Treated as read and write in all cases.
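 * (A semop may both test and alter values, so no finer-grained
 * distinction is attempted.)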
 *
 * Returns 0 if access is allowed, error code otherwise
 */
static int smack_sem_semop(struct kern_ipc_perm *isp, struct sembuf *sops,
			   unsigned nsops, int alter)
{
	return smk_curacc_sem(isp, MAY_READWRITE);
}

/**
 * smk_curacc_msq : helper to check if current has access on msq
 * @isp : the msq
 * @access : access requested
 *
 * return 0 if current has access, error otherwise
 */
static int smk_curacc_msq(struct kern_ipc_perm *isp, int access)
{
	struct smack_known *msp = smack_of_ipc(isp);
	struct smk_audit_info ad;
	int rc;

#ifdef CONFIG_AUDIT
	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
	ad.a.u.ipc_id = isp->id;
#endif
	rc = smk_curacc(msp, access, &ad);
	rc = smk_bu_current("msq", msp, access, rc);
	return rc;
}

/**
 * smack_msg_queue_associate - Smack access check for msg_queue
 * @isp: the object
 * @msqflg: access requested
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_msg_queue_associate(struct kern_ipc_perm *isp, int msqflg)
{
	int may;

	may = smack_flags_to_may(msqflg);
	return smk_curacc_msq(isp, may);
}

/**
 * smack_msg_queue_msgctl - Smack access check for msg_queue
 * @isp: the object
 * @cmd: what it wants to do
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_msg_queue_msgctl(struct kern_ipc_perm *isp, int cmd)
{
	int may;

	switch (cmd) {
	case IPC_STAT:
	case MSG_STAT:
	case MSG_STAT_ANY:
		may = MAY_READ;
		break;
	case IPC_SET:
	case IPC_RMID:
		may = MAY_READWRITE;
		break;
	case IPC_INFO:
	case MSG_INFO:
		/*
		 * System level information
		 */
		return 0;
	default:
		return -EINVAL;
	}

	return smk_curacc_msq(isp, may);
}

/**
 * smack_msg_queue_msgsnd - Smack access check for msg_queue
 * @isp: the object
 * @msg: unused
 * @msqflg: access requested
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_msg_queue_msgsnd(struct kern_ipc_perm *isp, struct msg_msg *msg,
				  int msqflg)
{
	int may;

	may = smack_flags_to_may(msqflg);
	return smk_curacc_msq(isp, may);
}

/**
 * smack_msg_queue_msgrcv - Smack access check for msg_queue
 * @isp: the object
 * @msg: unused
 * @target: unused
 * @type: unused
 * @mode: unused
 *
 * Returns 0 if current has read and write access, error code otherwise
 */
static int smack_msg_queue_msgrcv(struct kern_ipc_perm *isp,
				  struct msg_msg *msg,
				  struct task_struct *target, long type,
				  int mode)
{
	return smk_curacc_msq(isp, MAY_READWRITE);
}

/**
 * smack_ipc_permission - Smack access for ipc_permission()
 * @ipp: the object permissions
 * @flag: access requested
 *
 * Returns 0 if current has read and write access, error code otherwise
 */
static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
{
	struct smack_known **blob = smack_ipc(ipp);
	struct smack_known *iskp = *blob;
	int may = smack_flags_to_may(flag);
	struct smk_audit_info ad;
	int rc;

#ifdef CONFIG_AUDIT
	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
	ad.a.u.ipc_id = ipp->id;
#endif
	rc = smk_curacc(iskp, may, &ad);
	rc = smk_bu_current("svipc", iskp, may, rc);
	return rc;
}

/**
 * smack_ipc_getlsmprop - Extract smack security data
 * @ipp: the object permissions
 * @prop: where result will be saved
 */
static void smack_ipc_getlsmprop(struct kern_ipc_perm *ipp, struct lsm_prop *prop)
{
	struct smack_known **iskpp = smack_ipc(ipp);

	prop->smack.skp = *iskpp;
}

/**
 * smack_d_instantiate - Make sure the blob is correct on an inode
 * @opt_dentry: dentry where inode will be attached
 * @inode: the object
 *
 * Set the inode's security blob if it hasn't been done already.
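 * Inodes already marked SMK_INODE_INSTANT are left untouched.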
 */
static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
{
	struct super_block *sbp;
	struct superblock_smack *sbsp;
	struct inode_smack *isp;
	struct smack_known *skp;
	struct smack_known *ckp = smk_of_current();
	struct smack_known *final;
	char trattr[TRANS_TRUE_SIZE];
	int transflag = 0;
	int rc;
	struct dentry *dp;

	if (inode == NULL)
		return;

	isp = smack_inode(inode);

	/*
	 * If the inode is already instantiated
	 * take the quick way out
	 */
	if (isp->smk_flags & SMK_INODE_INSTANT)
		return;

	sbp = inode->i_sb;
	sbsp = smack_superblock(sbp);
	/*
	 * We're going to use the superblock default label
	 * if there's no label on the file.
	 */
	final = sbsp->smk_default;

	/*
	 * If this is the root inode the superblock
	 * may be in the process of initialization.
	 * If that is the case use the root value out
	 * of the superblock.
	 */
	if (opt_dentry->d_parent == opt_dentry) {
		switch (sbp->s_magic) {
		case CGROUP_SUPER_MAGIC:
		case CGROUP2_SUPER_MAGIC:
			/*
			 * The cgroup filesystem is never mounted,
			 * so there's no opportunity to set the mount
			 * options.
			 */
			sbsp->smk_root = &smack_known_star;
			sbsp->smk_default = &smack_known_star;
			isp->smk_inode = sbsp->smk_root;
			break;
		case TMPFS_MAGIC:
			/*
			 * What about shmem/tmpfs anonymous files with dentry
			 * obtained from d_alloc_pseudo()?
			 */
			isp->smk_inode = smk_of_current();
			break;
		case PIPEFS_MAGIC:
			isp->smk_inode = smk_of_current();
			break;
		case SOCKFS_MAGIC:
			/*
			 * Socket access is controlled by the socket
			 * structures associated with the task involved.
			 */
			isp->smk_inode = &smack_known_star;
			break;
		default:
			isp->smk_inode = sbsp->smk_root;
			break;
		}
		isp->smk_flags |= SMK_INODE_INSTANT;
		return;
	}

	/*
	 * This is pretty hackish.
	 * Casey says that we shouldn't have to do
	 * file system specific code, but it does help
	 * with keeping it simple.
	 */
	switch (sbp->s_magic) {
	case SMACK_MAGIC:
	case CGROUP_SUPER_MAGIC:
	case CGROUP2_SUPER_MAGIC:
		/*
		 * Casey says that it's a little embarrassing
		 * that the smack file system doesn't do
		 * extended attributes.
		 *
		 * Cgroupfs is special
		 */
		final = &smack_known_star;
		break;
	case DEVPTS_SUPER_MAGIC:
		/*
		 * devpts seems content with the label of the task.
		 * Programs that change smack have to treat the
		 * pty with respect.
		 */
		final = ckp;
		break;
	case PROC_SUPER_MAGIC:
		/*
		 * Casey says procfs appears not to care.
		 * The superblock default suffices.
		 */
		break;
	case TMPFS_MAGIC:
		/*
		 * Device labels should come from the filesystem,
		 * but watch out, because they're volatile,
		 * getting recreated on every reboot.
		 */
		final = &smack_known_star;
		/*
		 * If a smack value has been set we want to use it,
		 * but since tmpfs isn't giving us the opportunity
		 * to set mount options simulate setting the
		 * superblock default.
		 */
		fallthrough;
	default:
		/*
		 * This isn't an understood special case.
		 * Get the value from the xattr.
		 */

		/*
		 * UNIX domain sockets use lower level socket data.
		 */
		if (S_ISSOCK(inode->i_mode)) {
			final = &smack_known_star;
			break;
		}
		/*
		 * No xattr support means, alas, no SMACK label.
		 * Use the previously applied default.
		 * It would be curious if the label of the task
		 * does not match that assigned.
		 */
		if (!(inode->i_opflags & IOP_XATTR))
			break;
		/*
		 * Get the dentry for xattr.
		 */
		dp = dget(opt_dentry);
		skp = smk_fetch(XATTR_NAME_SMACK, inode, dp);
		if (!IS_ERR_OR_NULL(skp))
			final = skp;

		/*
		 * Transmuting directory
		 */
		if (S_ISDIR(inode->i_mode)) {
			/*
			 * If this is a new directory and the label was
			 * transmuted when the inode was initialized
			 * set the transmute attribute on the directory
			 * and mark the inode.
			 *
			 * If there is a transmute attribute on the
			 * directory mark the inode.
			 */
			rc = __vfs_getxattr(dp, inode,
					    XATTR_NAME_SMACKTRANSMUTE, trattr,
					    TRANS_TRUE_SIZE);
			if (rc >= 0 && strncmp(trattr, TRANS_TRUE,
					       TRANS_TRUE_SIZE) != 0)
				rc = -EINVAL;
			if (rc >= 0)
				transflag = SMK_INODE_TRANSMUTE;
		}
		/*
		 * Don't let the exec or mmap label be "*" or "@".
		 */
		skp = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp);
		if (IS_ERR(skp) || skp == &smack_known_star ||
		    skp == &smack_known_web)
			skp = NULL;
		isp->smk_task = skp;

		skp = smk_fetch(XATTR_NAME_SMACKMMAP, inode, dp);
		if (IS_ERR(skp) || skp == &smack_known_star ||
		    skp == &smack_known_web)
			skp = NULL;
		isp->smk_mmap = skp;

		dput(dp);
		break;
	}

	if (final == NULL)
		isp->smk_inode = ckp;
	else
		isp->smk_inode = final;

	isp->smk_flags |= (SMK_INODE_INSTANT | transflag);

	return;
}

/**
 * smack_getselfattr - Smack current process attribute
 * @attr: which attribute to fetch
 * @ctx: buffer to receive the result
 * @size: available size in, actual size out
 * @flags: unused
 *
 * Fill the passed user space @ctx with the details of the requested
 * attribute.
 *
 * Returns the number of attributes on success, an error code otherwise.
 * There will only ever be one attribute.
 */
static int smack_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
			     u32 *size, u32 flags)
{
	int rc;
	struct smack_known *skp;

	if (attr != LSM_ATTR_CURRENT)
		return -EOPNOTSUPP;

	skp = smk_of_current();
	rc = lsm_fill_user_ctx(ctx, size,
			       skp->smk_known, strlen(skp->smk_known) + 1,
			       LSM_ID_SMACK, 0);

	return (!rc ? 1 : rc);
}

/**
 * smack_getprocattr - Smack process attribute access
 * @p: the object task
 * @name: the name of the attribute in /proc/.../attr
 * @value: where to put the result
 *
 * Places a copy of the task Smack into value
 *
 * Returns the length of the smack label or an error code
 */
static int smack_getprocattr(struct task_struct *p, const char *name, char **value)
{
	struct smack_known *skp = smk_of_task_struct_obj(p);
	char *cp;
	int slen;

	if (strcmp(name, "current") != 0)
		return -EINVAL;

	cp = kstrdup(skp->smk_known, GFP_KERNEL);
	if (cp == NULL)
		return -ENOMEM;

	slen = strlen(cp);
	*value = cp;
	return slen;
}

/**
 * do_setattr - Smack process attribute setting
 * @attr: the ID of the attribute
 * @value: the value to set
 * @size: the size of the value
 *
 * Sets the Smack value of the task. Only setting self
 * is permitted and only with privilege
 *
 * Returns the length of the smack label or an error code
 */
static int do_setattr(u64 attr, void *value, size_t size)
{
	struct task_smack *tsp = smack_cred(current_cred());
	struct cred *new;
	struct smack_known *skp;
	struct smack_known_list_elem *sklep;
	int rc;

	if (!smack_privileged(CAP_MAC_ADMIN) && list_empty(&tsp->smk_relabel))
		return -EPERM;

	if (value == NULL || size == 0 || size >= SMK_LONGLABEL)
		return -EINVAL;

	if (attr != LSM_ATTR_CURRENT)
		return -EOPNOTSUPP;

	skp = smk_import_entry(value, size);
	if (IS_ERR(skp))
		return PTR_ERR(skp);

	/*
	 * No process is ever allowed the web ("@") label
	 * and the star ("*") label.
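	 * (Both labels act as wildcards in rule matching, so holding
	 * one would subvert the policy.)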
	 */
	if (skp == &smack_known_web || skp == &smack_known_star)
		return -EINVAL;

	if (!smack_privileged(CAP_MAC_ADMIN)) {
		rc = -EPERM;
		list_for_each_entry(sklep, &tsp->smk_relabel, list)
			if (sklep->smk_label == skp) {
				rc = 0;
				break;
			}
		if (rc)
			return rc;
	}

	new = prepare_creds();
	if (new == NULL)
		return -ENOMEM;

	tsp = smack_cred(new);
	tsp->smk_task = skp;
	/*
	 * process can change its label only once
	 */
	smk_destroy_label_list(&tsp->smk_relabel);

	commit_creds(new);
	return size;
}

/**
 * smack_setselfattr - Set a Smack process attribute
 * @attr: which attribute to set
 * @ctx: buffer containing the data
 * @size: size of @ctx
 * @flags: unused
 *
 * Set the Smack label of the current process from the data in @ctx.
 *
 * Returns 0 on success, an error code otherwise.
 */
static int smack_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
			     u32 size, u32 flags)
{
	int rc;

	rc = do_setattr(attr, ctx->ctx, ctx->ctx_len);
	if (rc > 0)
		return 0;
	return rc;
}

/**
 * smack_setprocattr - Smack process attribute setting
 * @name: the name of the attribute in /proc/.../attr
 * @value: the value to set
 * @size: the size of the value
 *
 * Sets the Smack value of the task. Only setting self
 * is permitted and only with privilege
 *
 * Returns the length of the smack label or an error code
 */
static int smack_setprocattr(const char *name, void *value, size_t size)
{
	int attr = lsm_name_to_attr(name);

	if (attr != LSM_ATTR_UNDEF)
		return do_setattr(attr, value, size);
	return -EINVAL;
}

/**
 * smack_unix_stream_connect - Smack access on UDS
 * @sock: one sock
 * @other: the other sock
 * @newsk: unused
 *
 * Return 0 if a subject with the smack of sock could access
 * an object with the smack of other, otherwise an error code
 */
static int smack_unix_stream_connect(struct sock *sock,
				     struct sock *other, struct sock *newsk)
{
	struct smack_known *skp;
	struct smack_known *okp;
	struct socket_smack *ssp = smack_sock(sock);
	struct socket_smack *osp = smack_sock(other);
	struct socket_smack *nsp = smack_sock(newsk);
	struct smk_audit_info ad;
	int rc = 0;
#ifdef CONFIG_AUDIT
	struct lsm_network_audit net;
#endif

	if (!smack_privileged(CAP_MAC_OVERRIDE)) {
		skp = ssp->smk_out;
		okp = osp->smk_in;
#ifdef CONFIG_AUDIT
		smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
		smk_ad_setfield_u_net_sk(&ad, other);
#endif
		rc = smk_access(skp, okp, MAY_WRITE, &ad);
		rc = smk_bu_note("UDS connect", skp, okp, MAY_WRITE, rc);
		if (rc == 0) {
			okp = osp->smk_out;
			skp = ssp->smk_in;
			rc = smk_access(okp, skp, MAY_WRITE, &ad);
			rc = smk_bu_note("UDS connect", okp, skp,
						MAY_WRITE, rc);
		}
	}

	if (rc == 0) {
		/*
		 * Cross reference the peer labels for SO_PEERSEC.
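		 * Each end records the other end's outbound label as
		 * its packet label; that is what getsockopt(SO_PEERSEC)
		 * reports.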
		 */
		nsp->smk_packet = ssp->smk_out;
		ssp->smk_packet = osp->smk_out;

		/*
		 * new/child/established socket must inherit listening socket labels
		 */
		nsp->smk_out = osp->smk_out;
		nsp->smk_in = osp->smk_in;
	}

	return rc;
}

/**
 * smack_unix_may_send - Smack access on UDS
 * @sock: one socket
 * @other: the other socket
 *
 * Return 0 if a subject with the smack of sock could access
 * an object with the smack of other, otherwise an error code
 */
static int smack_unix_may_send(struct socket *sock, struct socket *other)
{
	struct socket_smack *ssp = smack_sock(sock->sk);
	struct socket_smack *osp = smack_sock(other->sk);
	struct smk_audit_info ad;
	int rc;

#ifdef CONFIG_AUDIT
	struct lsm_network_audit net;

	smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
	smk_ad_setfield_u_net_sk(&ad, other->sk);
#endif

	if (smack_privileged(CAP_MAC_OVERRIDE))
		return 0;

	rc = smk_access(ssp->smk_out, osp->smk_in, MAY_WRITE, &ad);
	rc = smk_bu_note("UDS send", ssp->smk_out, osp->smk_in, MAY_WRITE, rc);
	return rc;
}

/**
 * smack_socket_sendmsg - Smack check based on destination host
 * @sock: the socket
 * @msg: the message
 * @size: the size of the message
 *
 * Return 0 if the current subject can write to the destination host.
 * For IPv4 this is only a question if the destination is a single label host.
 * For IPv6 this is a check against the label of the port.
 */
static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg,
				int size)
{
	struct sockaddr_in *sip = (struct sockaddr_in *) msg->msg_name;
#if IS_ENABLED(CONFIG_IPV6)
	struct sockaddr_in6 *sap = (struct sockaddr_in6 *) msg->msg_name;
#endif
#ifdef SMACK_IPV6_SECMARK_LABELING
	struct socket_smack *ssp = smack_sock(sock->sk);
	struct smack_known *rsp;
#endif
	int rc = 0;

	/*
	 * Perfectly reasonable for this to be NULL
	 */
	if (sip == NULL)
		return 0;

	switch (sock->sk->sk_family) {
	case AF_INET:
		if (msg->msg_namelen < sizeof(struct sockaddr_in) ||
		    sip->sin_family != AF_INET)
			return -EINVAL;
		rc = smk_ipv4_check(sock->sk, sip);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		if (msg->msg_namelen < SIN6_LEN_RFC2133 ||
		    sap->sin6_family != AF_INET6)
			return -EINVAL;
#ifdef SMACK_IPV6_SECMARK_LABELING
		rsp = smack_ipv6host_label(sap);
		if (rsp != NULL)
			rc = smk_ipv6_check(ssp->smk_out, rsp, sap,
						SMK_CONNECTING);
#endif
#ifdef SMACK_IPV6_PORT_LABELING
		rc = smk_ipv6_port_check(sock->sk, sap, SMK_SENDING);
#endif
#endif /* IS_ENABLED(CONFIG_IPV6) */
		break;
	}

	return rc;
}

/**
 * smack_from_secattr - Convert a netlabel attr.mls.lvl/attr.mls.cat pair to smack
 * @sap: netlabel secattr
 * @ssp: socket security information
 *
 * Returns a pointer to a Smack label entry found on the label list.
 */
static struct smack_known *smack_from_secattr(struct netlbl_lsm_secattr *sap,
						struct socket_smack *ssp)
{
	struct smack_known *skp;
	int found = 0;
	int acat;
	int kcat;

	/*
	 * Netlabel found it in the cache.
	 */
	if ((sap->flags & NETLBL_SECATTR_CACHE) != 0)
		return (struct smack_known *)sap->cache->data;

	if ((sap->flags & NETLBL_SECATTR_SECID) != 0)
		/*
		 * Looks like a fallback, which gives us a secid.
		 */
		return smack_from_secid(sap->attr.secid);

	if ((sap->flags & NETLBL_SECATTR_MLS_LVL) != 0) {
		/*
		 * Looks like a CIPSO packet.
		 * If there are flags but no level netlabel isn't
		 * behaving the way we expect it to.
		 *
		 * Look it up in the label table
		 * Without guidance regarding the smack value
		 * for the packet fall back on the network
		 * ambient value.
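		 *
		 * A packet matches a known label only if both the CIPSO
		 * level and the complete category set agree.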
*/ rcu_read_lock(); list_for_each_entry_rcu(skp, &smack_known_list, list) { if (sap->attr.mls.lvl != skp->smk_netlabel.attr.mls.lvl) continue; /* * Compare the catsets. Use the netlbl APIs. */ if ((sap->flags & NETLBL_SECATTR_MLS_CAT) == 0) { if ((skp->smk_netlabel.flags & NETLBL_SECATTR_MLS_CAT) == 0) found = 1; break; } for (acat = -1, kcat = -1; acat == kcat; ) { acat = netlbl_catmap_walk(sap->attr.mls.cat, acat + 1); kcat = netlbl_catmap_walk( skp->smk_netlabel.attr.mls.cat, kcat + 1); if (acat < 0 || kcat < 0) break; } if (acat == kcat) { found = 1; break; } } rcu_read_unlock(); if (found) return skp; if (ssp != NULL && ssp->smk_in == &smack_known_star) return &smack_known_web; return &smack_known_star; } /* * Without guidance regarding the smack value * for the packet fall back on the network * ambient value. */ return smack_net_ambient; } #if IS_ENABLED(CONFIG_IPV6) static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip) { u8 nexthdr; int offset; int proto = -EINVAL; struct ipv6hdr _ipv6h; struct ipv6hdr *ip6; __be16 frag_off; struct tcphdr _tcph, *th; struct udphdr _udph, *uh; struct dccp_hdr _dccph, *dh; sip->sin6_port = 0; offset = skb_network_offset(skb); ip6 = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (ip6 == NULL) return -EINVAL; sip->sin6_addr = ip6->saddr; nexthdr = ip6->nexthdr; offset += sizeof(_ipv6h); offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) return -EINVAL; proto = nexthdr; switch (proto) { case IPPROTO_TCP: th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th != NULL) sip->sin6_port = th->source; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh != NULL) sip->sin6_port = uh->source; break; case IPPROTO_DCCP: dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh != NULL) sip->sin6_port = dh->dccph_sport; break; } return proto; } #endif /* CONFIG_IPV6 */ /** * smack_from_skb - Smack data from the secmark in an skb * @skb: packet * * Returns smack_known of the secmark or NULL if that won't work. */ #ifdef CONFIG_NETWORK_SECMARK static struct smack_known *smack_from_skb(struct sk_buff *skb) { if (skb == NULL || skb->secmark == 0) return NULL; return smack_from_secid(skb->secmark); } #else static inline struct smack_known *smack_from_skb(struct sk_buff *skb) { return NULL; } #endif /** * smack_from_netlbl - Smack data from the IP options in an skb * @sk: socket data came in on * @family: address family * @skb: packet * * Find the Smack label in the IP options. If it hasn't been * added to the netlabel cache, add it here. * * Returns smack_known of the IP options or NULL if that won't work. 
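 */

/*
 * Editorial sketch (hypothetical, not kernel code): the catset comparison
 * in smack_from_secattr() above walks two sorted category sets in
 * lockstep; the sets match only when both walks run out together. The
 * walk() helper below stands in for netlbl_catmap_walk().
 */
#include <stdbool.h>
#include <stddef.h>

static int walk(const int *set, size_t len, int start)
{
	/* First category >= start, or -1 when the set is exhausted. */
	for (size_t i = 0; i < len; i++)
		if (set[i] >= start)
			return set[i];
	return -1;
}

static bool catsets_equal(const int *a, size_t alen,
			  const int *b, size_t blen)
{
	int acat = -1, kcat = -1;

	while (acat == kcat) {
		acat = walk(a, alen, acat + 1);
		kcat = walk(b, blen, kcat + 1);
		if (acat < 0 || kcat < 0)
			break;
	}
	/* Equal only if both walks hit the end (-1) at the same time. */
	return acat == kcat;
}

/*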
 */
static struct smack_known *smack_from_netlbl(const struct sock *sk, u16 family,
					     struct sk_buff *skb)
{
	struct netlbl_lsm_secattr secattr;
	struct socket_smack *ssp = NULL;
	struct smack_known *skp = NULL;

	netlbl_secattr_init(&secattr);

	if (sk)
		ssp = smack_sock(sk);

	if (netlbl_skbuff_getattr(skb, family, &secattr) == 0) {
		skp = smack_from_secattr(&secattr, ssp);
		if (secattr.flags & NETLBL_SECATTR_CACHEABLE)
			netlbl_cache_add(skb, family, &skp->smk_netlabel);
	}

	netlbl_secattr_destroy(&secattr);

	return skp;
}

/**
 * smack_socket_sock_rcv_skb - Smack packet delivery access check
 * @sk: socket
 * @skb: packet
 *
 * Returns 0 if the packet should be delivered, an error code otherwise
 */
static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct socket_smack *ssp = smack_sock(sk);
	struct smack_known *skp = NULL;
	int rc = 0;
	struct smk_audit_info ad;
	u16 family = sk->sk_family;
#ifdef CONFIG_AUDIT
	struct lsm_network_audit net;
#endif
#if IS_ENABLED(CONFIG_IPV6)
	struct sockaddr_in6 sadd;
	int proto;

	if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
		family = PF_INET;
#endif /* CONFIG_IPV6 */

	switch (family) {
	case PF_INET:
		/*
		 * If there is a secmark use it rather than the CIPSO label.
		 * If there is no secmark fall back to CIPSO.
		 * The secmark is assumed to reflect policy better.
		 */
		skp = smack_from_skb(skb);
		if (skp == NULL) {
			skp = smack_from_netlbl(sk, family, skb);
			if (skp == NULL)
				skp = smack_net_ambient;
		}
#ifdef CONFIG_AUDIT
		smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
		ad.a.u.net->family = family;
		ad.a.u.net->netif = skb->skb_iif;
		ipv4_skb_to_auditdata(skb, &ad.a, NULL);
#endif
		/*
		 * Receiving a packet requires that the other end
		 * be able to write here. Read access is not required.
		 * This is the simplest possible security model
		 * for networking.
 */
		rc = smk_access(skp, ssp->smk_in, MAY_WRITE, &ad);
		rc = smk_bu_note("IPv4 delivery", skp, ssp->smk_in,
				 MAY_WRITE, rc);
		if (rc != 0)
			netlbl_skbuff_err(skb, family, rc, 0);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case PF_INET6:
		proto = smk_skb_to_addr_ipv6(skb, &sadd);
		if (proto != IPPROTO_UDP && proto != IPPROTO_UDPLITE &&
		    proto != IPPROTO_TCP && proto != IPPROTO_DCCP)
			break;
#ifdef SMACK_IPV6_SECMARK_LABELING
		skp = smack_from_skb(skb);
		if (skp == NULL) {
			if (smk_ipv6_localhost(&sadd))
				break;
			skp = smack_ipv6host_label(&sadd);
			if (skp == NULL)
				skp = smack_net_ambient;
		}
#ifdef CONFIG_AUDIT
		smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
		ad.a.u.net->family = family;
		ad.a.u.net->netif = skb->skb_iif;
		ipv6_skb_to_auditdata(skb, &ad.a, NULL);
#endif /* CONFIG_AUDIT */
		rc = smk_access(skp, ssp->smk_in, MAY_WRITE, &ad);
		rc = smk_bu_note("IPv6 delivery", skp, ssp->smk_in,
				 MAY_WRITE, rc);
#endif /* SMACK_IPV6_SECMARK_LABELING */
#ifdef SMACK_IPV6_PORT_LABELING
		rc = smk_ipv6_port_check(sk, &sadd, SMK_RECEIVING);
#endif /* SMACK_IPV6_PORT_LABELING */
		if (rc != 0)
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_ADM_PROHIBITED, 0);
		break;
#endif /* CONFIG_IPV6 */
	}

	return rc;
}

/**
 * smack_socket_getpeersec_stream - pull in packet label
 * @sock: the socket
 * @optval: user's destination
 * @optlen: size thereof
 * @len: max thereof
 *
 * returns zero on success, an error code otherwise
 */
static int smack_socket_getpeersec_stream(struct socket *sock,
					  sockptr_t optval, sockptr_t optlen,
					  unsigned int len)
{
	struct socket_smack *ssp;
	char *rcp = "";
	u32 slen = 1;
	int rc = 0;

	ssp = smack_sock(sock->sk);
	if (ssp->smk_packet != NULL) {
		rcp = ssp->smk_packet->smk_known;
		slen = strlen(rcp) + 1;
	}
	if (slen > len) {
		rc = -ERANGE;
		goto out_len;
	}

	if (copy_to_sockptr(optval, rcp, slen))
		rc = -EFAULT;
out_len:
	if (copy_to_sockptr(optlen, &slen, sizeof(slen)))
		rc = -EFAULT;
	return rc;
}

/**
 * smack_socket_getpeersec_dgram - pull in packet label
 * @sock: the peer socket
 * @skb: packet data
 * @secid: pointer to where to put the secid of the packet
 *
 * Returns the packet's secid in @secid, or an error code.
 */
static int smack_socket_getpeersec_dgram(struct socket *sock,
					 struct sk_buff *skb, u32 *secid)
{
	struct socket_smack *ssp = NULL;
	struct smack_known *skp;
	struct sock *sk = NULL;
	int family = PF_UNSPEC;
	u32 s = 0;	/* 0 is the invalid secid */

	if (skb != NULL) {
		if (skb->protocol == htons(ETH_P_IP))
			family = PF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			family = PF_INET6;
#endif /* CONFIG_IPV6 */
	}
	if (family == PF_UNSPEC && sock != NULL)
		family = sock->sk->sk_family;

	switch (family) {
	case PF_UNIX:
		ssp = smack_sock(sock->sk);
		s = ssp->smk_out->smk_secid;
		break;
	case PF_INET:
		skp = smack_from_skb(skb);
		if (skp) {
			s = skp->smk_secid;
			break;
		}
		/*
		 * Translate what netlabel gave us.
		 */
		if (sock != NULL)
			sk = sock->sk;
		skp = smack_from_netlbl(sk, family, skb);
		if (skp != NULL)
			s = skp->smk_secid;
		break;
	case PF_INET6:
#ifdef SMACK_IPV6_SECMARK_LABELING
		skp = smack_from_skb(skb);
		if (skp)
			s = skp->smk_secid;
#endif
		break;
	}
	*secid = s;
	if (s == 0)
		return -EINVAL;
	return 0;
}

/**
 * smack_sock_graft - Initialize a newly created socket with an existing sock
 * @sk: child sock
 * @parent: parent socket
 *
 * Set the smk_{in,out} state of an existing sock based on the process that
 * is creating the new socket.
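 */

/*
 * Editorial sketch (hypothetical, user space): the label stored in
 * ssp->smk_packet by smack_socket_getpeersec_stream() is what a peer
 * reads back with the standard SO_PEERSEC socket option.
 */
#include <stdio.h>
#include <sys/socket.h>

static void print_peer_label(int fd)
{
	char label[256];
	socklen_t len = sizeof(label);

	if (getsockopt(fd, SOL_SOCKET, SO_PEERSEC, label, &len) == 0)
		printf("peer label: %.*s\n", (int)len, label);
	else
		perror("SO_PEERSEC");
}

/*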
 */
static void smack_sock_graft(struct sock *sk, struct socket *parent)
{
	struct socket_smack *ssp;
	struct smack_known *skp = smk_of_current();

	if (sk == NULL ||
	    (sk->sk_family != PF_INET && sk->sk_family != PF_INET6))
		return;

	ssp = smack_sock(sk);
	ssp->smk_in = skp;
	ssp->smk_out = skp;
	/* ssp->smk_packet is already set in smack_inet_csk_clone() */
}

/**
 * smack_inet_conn_request - Smack access check on connect
 * @sk: socket involved
 * @skb: packet
 * @req: unused
 *
 * Returns 0 if a task with the packet label could write to
 * the socket, otherwise an error code
 */
static int smack_inet_conn_request(const struct sock *sk, struct sk_buff *skb,
				   struct request_sock *req)
{
	u16 family = sk->sk_family;
	struct smack_known *skp;
	struct socket_smack *ssp = smack_sock(sk);
	struct sockaddr_in addr;
	struct iphdr *hdr;
	struct smack_known *hskp;
	int rc;
	struct smk_audit_info ad;
#ifdef CONFIG_AUDIT
	struct lsm_network_audit net;
#endif

#if IS_ENABLED(CONFIG_IPV6)
	if (family == PF_INET6) {
		/*
		 * Handle mapped IPv4 packets arriving
		 * via IPv6 sockets. Don't set up netlabel
		 * processing on IPv6.
		 */
		if (skb->protocol == htons(ETH_P_IP))
			family = PF_INET;
		else
			return 0;
	}
#endif /* CONFIG_IPV6 */

	/*
	 * If there is a secmark use it rather than the CIPSO label.
	 * If there is no secmark fall back to CIPSO.
	 * The secmark is assumed to reflect policy better.
	 */
	skp = smack_from_skb(skb);
	if (skp == NULL) {
		skp = smack_from_netlbl(sk, family, skb);
		if (skp == NULL)
			skp = &smack_known_huh;
	}

#ifdef CONFIG_AUDIT
	smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
	ad.a.u.net->family = family;
	ad.a.u.net->netif = skb->skb_iif;
	ipv4_skb_to_auditdata(skb, &ad.a, NULL);
#endif
	/*
	 * Receiving a packet requires that the other end be able to write
	 * here. Read access is not required.
	 */
	rc = smk_access(skp, ssp->smk_in, MAY_WRITE, &ad);
	rc = smk_bu_note("IPv4 connect", skp, ssp->smk_in, MAY_WRITE, rc);
	if (rc != 0)
		return rc;

	/*
	 * Save the peer's label in the request_sock so we can later set up
	 * smk_packet in the child socket so that SO_PEERSEC can report it.
	 */
	req->peer_secid = skp->smk_secid;

	/*
	 * We need to decide if we want to label the incoming connection
	 * here. If we do, we only need to label the request_sock and the
	 * stack will propagate the wire-label to the sock when it is
	 * created.
	 */
	hdr = ip_hdr(skb);
	addr.sin_addr.s_addr = hdr->saddr;
	rcu_read_lock();
	hskp = smack_ipv4host_label(&addr);
	rcu_read_unlock();

	if (hskp == NULL)
		rc = netlbl_req_setattr(req, &ssp->smk_out->smk_netlabel);
	else
		netlbl_req_delattr(req);

	return rc;
}

/**
 * smack_inet_csk_clone - Copy the connection information to the new socket
 * @sk: the new socket
 * @req: the connection's request_sock
 *
 * Transfer the connection's peer label to the newly created socket.
 */
static void smack_inet_csk_clone(struct sock *sk,
				 const struct request_sock *req)
{
	struct socket_smack *ssp = smack_sock(sk);
	struct smack_known *skp;

	if (req->peer_secid != 0) {
		skp = smack_from_secid(req->peer_secid);
		ssp->smk_packet = skp;
	} else
		ssp->smk_packet = NULL;
}

/*
 * Key management security hooks
 *
 * Casey has not tested key support very heavily.
 * The permission check is most likely too restrictive.
 * If you care about keys please have a look.
*/ #ifdef CONFIG_KEYS /** * smack_key_alloc - Set the key security blob * @key: object * @cred: the credentials to use * @flags: unused * * No allocation required * * Returns 0 */ static int smack_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { struct smack_known **blob = smack_key(key); struct smack_known *skp = smk_of_task(smack_cred(cred)); *blob = skp; return 0; } /** * smack_key_permission - Smack access on a key * @key_ref: gets to the object * @cred: the credentials to use * @need_perm: requested key permission * * Return 0 if the task has read and write to the object, * an error code otherwise */ static int smack_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { struct smack_known **blob; struct smack_known *skp; struct key *keyp; struct smk_audit_info ad; struct smack_known *tkp = smk_of_task(smack_cred(cred)); int request = 0; int rc; /* * Validate requested permissions */ switch (need_perm) { case KEY_NEED_READ: case KEY_NEED_SEARCH: case KEY_NEED_VIEW: request |= MAY_READ; break; case KEY_NEED_WRITE: case KEY_NEED_LINK: case KEY_NEED_SETATTR: request |= MAY_WRITE; break; case KEY_NEED_UNSPECIFIED: case KEY_NEED_UNLINK: case KEY_SYSADMIN_OVERRIDE: case KEY_AUTHTOKEN_OVERRIDE: case KEY_DEFER_PERM_CHECK: return 0; default: return -EINVAL; } keyp = key_ref_to_ptr(key_ref); if (keyp == NULL) return -EINVAL; /* * If the key hasn't been initialized give it access so that * it may do so. */ blob = smack_key(keyp); skp = *blob; if (skp == NULL) return 0; /* * This should not occur */ if (tkp == NULL) return -EACCES; if (smack_privileged(CAP_MAC_OVERRIDE)) return 0; #ifdef CONFIG_AUDIT smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY); ad.a.u.key_struct.key = keyp->serial; ad.a.u.key_struct.key_desc = keyp->description; #endif rc = smk_access(tkp, skp, request, &ad); rc = smk_bu_note("key access", tkp, skp, request, rc); return rc; } /* * smack_key_getsecurity - Smack label tagging the key * @key points to the key to be queried * @_buffer points to a pointer that should be set to point to the * resulting string (if no label or an error occurs). * Return the length of the string (including terminating NUL) or -ve if * an error. * May also return 0 (and a NULL buffer pointer) if there is no label. */ static int smack_key_getsecurity(struct key *key, char **_buffer) { struct smack_known **blob = smack_key(key); struct smack_known *skp = *blob; size_t length; char *copy; if (skp == NULL) { *_buffer = NULL; return 0; } copy = kstrdup(skp->smk_known, GFP_KERNEL); if (copy == NULL) return -ENOMEM; length = strlen(copy) + 1; *_buffer = copy; return length; } #ifdef CONFIG_KEY_NOTIFICATIONS /** * smack_watch_key - Smack access to watch a key for notifications. * @key: The key to be watched * * Return 0 if the @watch->cred has permission to read from the key object and * an error otherwise. 
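 */

/*
 * Editorial sketch (hypothetical, user space): the string built by
 * smack_key_getsecurity() above is what KEYCTL_GET_SECURITY returns
 * as a key's security description.
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/keyctl.h>

static void print_key_label(long key_serial)
{
	char buf[256];
	long n = syscall(SYS_keyctl, KEYCTL_GET_SECURITY,
			 key_serial, buf, sizeof(buf));

	if (n >= 0)
		printf("key %ld label: %s\n", key_serial, buf);
	else
		perror("KEYCTL_GET_SECURITY");
}

/*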
 */
static int smack_watch_key(struct key *key)
{
	struct smk_audit_info ad;
	struct smack_known *tkp = smk_of_current();
	struct smack_known **blob = smack_key(key);
	int rc;

	/*
	 * This should not occur
	 */
	if (tkp == NULL)
		return -EACCES;

	if (smack_privileged_cred(CAP_MAC_OVERRIDE, current_cred()))
		return 0;

#ifdef CONFIG_AUDIT
	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY);
	ad.a.u.key_struct.key = key->serial;
	ad.a.u.key_struct.key_desc = key->description;
#endif
	rc = smk_access(tkp, *blob, MAY_READ, &ad);
	rc = smk_bu_note("key watch", tkp, *blob, MAY_READ, rc);
	return rc;
}
#endif /* CONFIG_KEY_NOTIFICATIONS */
#endif /* CONFIG_KEYS */

#ifdef CONFIG_WATCH_QUEUE
/**
 * smack_post_notification - Smack access to post a notification to a queue
 * @w_cred: The credentials of the watcher.
 * @cred: The credentials of the event source (may be NULL).
 * @n: The notification message to be posted.
 */
static int smack_post_notification(const struct cred *w_cred,
				   const struct cred *cred,
				   struct watch_notification *n)
{
	struct smk_audit_info ad;
	struct smack_known *subj, *obj;
	int rc;

	/* Always let maintenance notifications through. */
	if (n->type == WATCH_TYPE_META)
		return 0;

	if (!cred)
		return 0;
	subj = smk_of_task(smack_cred(cred));
	obj = smk_of_task(smack_cred(w_cred));

	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NOTIFICATION);
	rc = smk_access(subj, obj, MAY_WRITE, &ad);
	rc = smk_bu_note("notification", subj, obj, MAY_WRITE, rc);
	return rc;
}
#endif /* CONFIG_WATCH_QUEUE */

/*
 * Smack Audit hooks
 *
 * Audit requires a unique representation of each Smack specific
 * rule. This unique representation is used to distinguish the
 * object to be audited from remaining kernel objects and also
 * works as a glue between the audit hooks.
 *
 * Since repository entries are added but never deleted, we'll use
 * the smack_known label address related to the given audit rule as
 * the needed unique representation. This also better fits the smack
 * model where nearly everything is a label.
 */
#ifdef CONFIG_AUDIT

/**
 * smack_audit_rule_init - Initialize a smack audit rule
 * @field: audit rule fields given from user-space (audit.h)
 * @op: required testing operator (=, !=, >, <, ...)
 * @rulestr: smack label to be audited
 * @vrule: pointer to save our own audit rule representation
 * @gfp: type of the memory for the allocation
 *
 * Prepare to audit cases where (@field @op @rulestr) is true.
 * The label to be audited is created if necessary.
 */
static int smack_audit_rule_init(u32 field, u32 op, char *rulestr,
				 void **vrule, gfp_t gfp)
{
	struct smack_known *skp;
	char **rule = (char **)vrule;
	*rule = NULL;

	if (field != AUDIT_SUBJ_USER && field != AUDIT_OBJ_USER)
		return -EINVAL;

	if (op != Audit_equal && op != Audit_not_equal)
		return -EINVAL;

	skp = smk_import_entry(rulestr, 0);
	if (IS_ERR(skp))
		return PTR_ERR(skp);

	*rule = skp->smk_known;

	return 0;
}

/**
 * smack_audit_rule_known - Distinguish Smack audit rules
 * @krule: rule of interest, in Audit kernel representation format
 *
 * This is used to filter Smack rules from remaining Audit ones.
 * If it's proved that this rule belongs to us, the
 * audit_rule_match hook will be called to do the final judgement.
 */
static int smack_audit_rule_known(struct audit_krule *krule)
{
	struct audit_field *f;
	int i;

	for (i = 0; i < krule->field_count; i++) {
		f = &krule->fields[i];

		if (f->type == AUDIT_SUBJ_USER || f->type == AUDIT_OBJ_USER)
			return 1;
	}

	return 0;
}

/**
 * smack_audit_rule_match - Audit given object?
* @prop: security id for identifying the object to test * @field: audit rule flags given from user-space * @op: required testing operator * @vrule: smack internal rule presentation * * The core Audit hook. It's used to take the decision of * whether to audit or not to audit a given object. */ static int smack_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule) { struct smack_known *skp = prop->smack.skp; char *rule = vrule; if (unlikely(!rule)) { WARN_ONCE(1, "Smack: missing rule\n"); return -ENOENT; } if (field != AUDIT_SUBJ_USER && field != AUDIT_OBJ_USER) return 0; /* * No need to do string comparisons. If a match occurs, * both pointers will point to the same smack_known * label. */ if (op == Audit_equal) return (rule == skp->smk_known); if (op == Audit_not_equal) return (rule != skp->smk_known); return 0; } /* * There is no need for a smack_audit_rule_free hook. * No memory was allocated. */ #endif /* CONFIG_AUDIT */ /** * smack_ismaclabel - check if xattr @name references a smack MAC label * @name: Full xattr name to check. */ static int smack_ismaclabel(const char *name) { return (strcmp(name, XATTR_SMACK_SUFFIX) == 0); } /** * smack_to_secctx - fill a lsm_context * @skp: Smack label * @cp: destination * * Fill the passed @cp and return the length of the string */ static int smack_to_secctx(struct smack_known *skp, struct lsm_context *cp) { int len = strlen(skp->smk_known); if (cp) { cp->context = skp->smk_known; cp->len = len; cp->id = LSM_ID_SMACK; } return len; } /** * smack_secid_to_secctx - return the smack label for a secid * @secid: incoming integer * @cp: destination * * Exists for networking code. */ static int smack_secid_to_secctx(u32 secid, struct lsm_context *cp) { return smack_to_secctx(smack_from_secid(secid), cp); } /** * smack_lsmprop_to_secctx - return the smack label * @prop: includes incoming Smack data * @cp: destination * * Exists for audit code. */ static int smack_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp) { return smack_to_secctx(prop->smack.skp, cp); } /** * smack_secctx_to_secid - return the secid for a smack label * @secdata: smack label * @seclen: how long result is * @secid: outgoing integer * * Exists for audit and networking code. */ static int smack_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { struct smack_known *skp = smk_find_entry(secdata); if (skp) *secid = skp->smk_secid; else *secid = 0; return 0; } /* * There used to be a smack_release_secctx hook * that did nothing back when hooks were in a vector. * Now that there's a list such a hook adds cost. 
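 */

/*
 * Editorial sketch (hypothetical, not kernel code): the pointer-equality
 * test in smack_audit_rule_match() works because every label string is
 * interned once in a global list, so two equal labels always share one
 * address and strcmp() becomes unnecessary. A toy interner:
 */
#include <string.h>

static const char *interned[64];	/* toy table, no bounds checking */
static int ninterned;

static const char *intern(const char *label)
{
	for (int i = 0; i < ninterned; i++)
		if (strcmp(interned[i], label) == 0)
			return interned[i];
	/* Caller must pass a string with static lifetime. */
	interned[ninterned++] = label;
	return label;
}
/* After interning, (a == b) is equivalent to (strcmp(a, b) == 0). */

/*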
 */
static int smack_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
	return smack_inode_setsecurity(inode, XATTR_SMACK_SUFFIX, ctx,
				       ctxlen, 0);
}

static int smack_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
	return __vfs_setxattr_locked(&nop_mnt_idmap, dentry, XATTR_NAME_SMACK,
				     ctx, ctxlen, 0, NULL);
}

static int smack_inode_getsecctx(struct inode *inode, struct lsm_context *cp)
{
	struct smack_known *skp = smk_of_inode(inode);

	cp->context = skp->smk_known;
	cp->len = strlen(skp->smk_known);
	cp->id = LSM_ID_SMACK;
	return 0;
}

static int smack_inode_copy_up(struct dentry *dentry, struct cred **new)
{
	struct task_smack *tsp;
	struct smack_known *skp;
	struct inode_smack *isp;
	struct cred *new_creds = *new;

	if (new_creds == NULL) {
		new_creds = prepare_creds();
		if (new_creds == NULL)
			return -ENOMEM;
	}

	tsp = smack_cred(new_creds);

	/*
	 * Get label from overlay inode and set it in create_sid
	 */
	isp = smack_inode(d_inode(dentry));
	skp = isp->smk_inode;
	tsp->smk_task = skp;
	*new = new_creds;
	return 0;
}

static int smack_inode_copy_up_xattr(struct dentry *src, const char *name)
{
	/*
	 * Return -ECANCELED if this is the smack access Smack attribute.
	 */
	if (!strcmp(name, XATTR_NAME_SMACK))
		return -ECANCELED;

	return -EOPNOTSUPP;
}

static int smack_dentry_create_files_as(struct dentry *dentry, int mode,
					struct qstr *name,
					const struct cred *old,
					struct cred *new)
{
	struct task_smack *otsp = smack_cred(old);
	struct task_smack *ntsp = smack_cred(new);
	struct inode_smack *isp;
	int may;

	/*
	 * Use the process credential unless all of
	 * the transmuting criteria are met
	 */
	ntsp->smk_task = otsp->smk_task;

	/*
	 * the attribute of the containing directory
	 */
	isp = smack_inode(d_inode(dentry->d_parent));

	if (isp->smk_flags & SMK_INODE_TRANSMUTE) {
		rcu_read_lock();
		may = smk_access_entry(otsp->smk_task->smk_known,
				       isp->smk_inode->smk_known,
				       &otsp->smk_task->smk_rules);
		rcu_read_unlock();

		/*
		 * If the directory is transmuting and the rule
		 * providing access is transmuting use the containing
		 * directory label instead of the process label.
		 */
		if (may > 0 && (may & MAY_TRANSMUTE)) {
			ntsp->smk_task = isp->smk_inode;
			ntsp->smk_transmuted = ntsp->smk_task;
		}
	}
	return 0;
}

#ifdef CONFIG_IO_URING
/**
 * smack_uring_override_creds - Is io_uring cred override allowed?
 * @new: the target creds
 *
 * Check to see if the current task is allowed to override its credentials
 * to service an io_uring operation.
 */
static int smack_uring_override_creds(const struct cred *new)
{
	struct task_smack *tsp = smack_cred(current_cred());
	struct task_smack *nsp = smack_cred(new);

	/*
	 * Allow the degenerate case where the new Smack value is
	 * the same as the current Smack value.
	 */
	if (tsp->smk_task == nsp->smk_task)
		return 0;

	if (smack_privileged_cred(CAP_MAC_OVERRIDE, current_cred()))
		return 0;

	return -EPERM;
}

/**
 * smack_uring_sqpoll - check if an io_uring polling thread can be created
 *
 * Check to see if the current task is allowed to create a new io_uring
 * kernel polling thread.
 */
static int smack_uring_sqpoll(void)
{
	if (smack_privileged_cred(CAP_MAC_ADMIN, current_cred()))
		return 0;

	return -EPERM;
}

/**
 * smack_uring_cmd - check on file operations for io_uring
 * @ioucmd: the command in question
 *
 * Make a best guess about whether an io_uring "command" should
 * be allowed. Use the same logic used for determining if the
 * file could be opened for read in the absence of better criteria.
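 */

/*
 * Editorial sketch (hypothetical, user space): the transmute path in
 * smack_dentry_create_files_as() above only triggers for directories
 * carrying the SMACK64TRANSMUTE attribute, which an administrator sets
 * like this (in addition to a rule granting 't' access).
 */
#include <stdio.h>
#include <sys/xattr.h>

static int make_transmuting(const char *dir)
{
	if (setxattr(dir, "security.SMACK64TRANSMUTE", "TRUE", 4, 0)) {
		perror("setxattr");
		return -1;
	}
	return 0;
}

/*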
*/ static int smack_uring_cmd(struct io_uring_cmd *ioucmd) { struct file *file = ioucmd->file; struct smk_audit_info ad; struct task_smack *tsp; struct inode *inode; int rc; if (!file) return -EINVAL; tsp = smack_cred(file->f_cred); inode = file_inode(file); smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH); smk_ad_setfield_u_fs_path(&ad, file->f_path); rc = smk_tskacc(tsp, smk_of_inode(inode), MAY_READ, &ad); rc = smk_bu_credfile(file->f_cred, file, MAY_READ, rc); return rc; } #endif /* CONFIG_IO_URING */ struct lsm_blob_sizes smack_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct task_smack), .lbs_file = sizeof(struct smack_known *), .lbs_inode = sizeof(struct inode_smack), .lbs_ipc = sizeof(struct smack_known *), .lbs_key = sizeof(struct smack_known *), .lbs_msg_msg = sizeof(struct smack_known *), .lbs_sock = sizeof(struct socket_smack), .lbs_superblock = sizeof(struct superblock_smack), .lbs_xattr_count = SMACK_INODE_INIT_XATTRS, }; static const struct lsm_id smack_lsmid = { .name = "smack", .id = LSM_ID_SMACK, }; static struct security_hook_list smack_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme), LSM_HOOK_INIT(syslog, smack_syslog), LSM_HOOK_INIT(fs_context_submount, smack_fs_context_submount), LSM_HOOK_INIT(fs_context_dup, smack_fs_context_dup), LSM_HOOK_INIT(fs_context_parse_param, smack_fs_context_parse_param), LSM_HOOK_INIT(sb_alloc_security, smack_sb_alloc_security), LSM_HOOK_INIT(sb_free_mnt_opts, smack_free_mnt_opts), LSM_HOOK_INIT(sb_eat_lsm_opts, smack_sb_eat_lsm_opts), LSM_HOOK_INIT(sb_statfs, smack_sb_statfs), LSM_HOOK_INIT(sb_set_mnt_opts, smack_set_mnt_opts), LSM_HOOK_INIT(bprm_creds_for_exec, smack_bprm_creds_for_exec), LSM_HOOK_INIT(inode_alloc_security, smack_inode_alloc_security), LSM_HOOK_INIT(inode_init_security, smack_inode_init_security), LSM_HOOK_INIT(inode_link, smack_inode_link), LSM_HOOK_INIT(inode_unlink, smack_inode_unlink), LSM_HOOK_INIT(inode_rmdir, smack_inode_rmdir), LSM_HOOK_INIT(inode_rename, smack_inode_rename), LSM_HOOK_INIT(inode_permission, smack_inode_permission), LSM_HOOK_INIT(inode_setattr, smack_inode_setattr), LSM_HOOK_INIT(inode_getattr, smack_inode_getattr), LSM_HOOK_INIT(inode_xattr_skipcap, smack_inode_xattr_skipcap), LSM_HOOK_INIT(inode_setxattr, smack_inode_setxattr), LSM_HOOK_INIT(inode_post_setxattr, smack_inode_post_setxattr), LSM_HOOK_INIT(inode_getxattr, smack_inode_getxattr), LSM_HOOK_INIT(inode_removexattr, smack_inode_removexattr), LSM_HOOK_INIT(inode_set_acl, smack_inode_set_acl), LSM_HOOK_INIT(inode_get_acl, smack_inode_get_acl), LSM_HOOK_INIT(inode_remove_acl, smack_inode_remove_acl), LSM_HOOK_INIT(inode_getsecurity, smack_inode_getsecurity), LSM_HOOK_INIT(inode_setsecurity, smack_inode_setsecurity), LSM_HOOK_INIT(inode_listsecurity, smack_inode_listsecurity), LSM_HOOK_INIT(inode_getlsmprop, smack_inode_getlsmprop), LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security), LSM_HOOK_INIT(file_ioctl, smack_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, smack_file_ioctl), LSM_HOOK_INIT(file_lock, smack_file_lock), LSM_HOOK_INIT(file_fcntl, smack_file_fcntl), LSM_HOOK_INIT(mmap_file, smack_mmap_file), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), LSM_HOOK_INIT(file_set_fowner, smack_file_set_fowner), LSM_HOOK_INIT(file_send_sigiotask, smack_file_send_sigiotask), LSM_HOOK_INIT(file_receive, smack_file_receive), LSM_HOOK_INIT(file_open, smack_file_open), LSM_HOOK_INIT(cred_alloc_blank, smack_cred_alloc_blank), 
LSM_HOOK_INIT(cred_free, smack_cred_free), LSM_HOOK_INIT(cred_prepare, smack_cred_prepare), LSM_HOOK_INIT(cred_transfer, smack_cred_transfer), LSM_HOOK_INIT(cred_getsecid, smack_cred_getsecid), LSM_HOOK_INIT(cred_getlsmprop, smack_cred_getlsmprop), LSM_HOOK_INIT(kernel_act_as, smack_kernel_act_as), LSM_HOOK_INIT(kernel_create_files_as, smack_kernel_create_files_as), LSM_HOOK_INIT(task_setpgid, smack_task_setpgid), LSM_HOOK_INIT(task_getpgid, smack_task_getpgid), LSM_HOOK_INIT(task_getsid, smack_task_getsid), LSM_HOOK_INIT(current_getlsmprop_subj, smack_current_getlsmprop_subj), LSM_HOOK_INIT(task_getlsmprop_obj, smack_task_getlsmprop_obj), LSM_HOOK_INIT(task_setnice, smack_task_setnice), LSM_HOOK_INIT(task_setioprio, smack_task_setioprio), LSM_HOOK_INIT(task_getioprio, smack_task_getioprio), LSM_HOOK_INIT(task_setscheduler, smack_task_setscheduler), LSM_HOOK_INIT(task_getscheduler, smack_task_getscheduler), LSM_HOOK_INIT(task_movememory, smack_task_movememory), LSM_HOOK_INIT(task_kill, smack_task_kill), LSM_HOOK_INIT(task_to_inode, smack_task_to_inode), LSM_HOOK_INIT(ipc_permission, smack_ipc_permission), LSM_HOOK_INIT(ipc_getlsmprop, smack_ipc_getlsmprop), LSM_HOOK_INIT(msg_msg_alloc_security, smack_msg_msg_alloc_security), LSM_HOOK_INIT(msg_queue_alloc_security, smack_ipc_alloc_security), LSM_HOOK_INIT(msg_queue_associate, smack_msg_queue_associate), LSM_HOOK_INIT(msg_queue_msgctl, smack_msg_queue_msgctl), LSM_HOOK_INIT(msg_queue_msgsnd, smack_msg_queue_msgsnd), LSM_HOOK_INIT(msg_queue_msgrcv, smack_msg_queue_msgrcv), LSM_HOOK_INIT(shm_alloc_security, smack_ipc_alloc_security), LSM_HOOK_INIT(shm_associate, smack_shm_associate), LSM_HOOK_INIT(shm_shmctl, smack_shm_shmctl), LSM_HOOK_INIT(shm_shmat, smack_shm_shmat), LSM_HOOK_INIT(sem_alloc_security, smack_ipc_alloc_security), LSM_HOOK_INIT(sem_associate, smack_sem_associate), LSM_HOOK_INIT(sem_semctl, smack_sem_semctl), LSM_HOOK_INIT(sem_semop, smack_sem_semop), LSM_HOOK_INIT(d_instantiate, smack_d_instantiate), LSM_HOOK_INIT(getselfattr, smack_getselfattr), LSM_HOOK_INIT(setselfattr, smack_setselfattr), LSM_HOOK_INIT(getprocattr, smack_getprocattr), LSM_HOOK_INIT(setprocattr, smack_setprocattr), LSM_HOOK_INIT(unix_stream_connect, smack_unix_stream_connect), LSM_HOOK_INIT(unix_may_send, smack_unix_may_send), LSM_HOOK_INIT(socket_post_create, smack_socket_post_create), LSM_HOOK_INIT(socket_socketpair, smack_socket_socketpair), #ifdef SMACK_IPV6_PORT_LABELING LSM_HOOK_INIT(socket_bind, smack_socket_bind), #endif LSM_HOOK_INIT(socket_connect, smack_socket_connect), LSM_HOOK_INIT(socket_sendmsg, smack_socket_sendmsg), LSM_HOOK_INIT(socket_sock_rcv_skb, smack_socket_sock_rcv_skb), LSM_HOOK_INIT(socket_getpeersec_stream, smack_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, smack_socket_getpeersec_dgram), LSM_HOOK_INIT(sk_alloc_security, smack_sk_alloc_security), #ifdef SMACK_IPV6_PORT_LABELING LSM_HOOK_INIT(sk_free_security, smack_sk_free_security), #endif LSM_HOOK_INIT(sk_clone_security, smack_sk_clone_security), LSM_HOOK_INIT(sock_graft, smack_sock_graft), LSM_HOOK_INIT(inet_conn_request, smack_inet_conn_request), LSM_HOOK_INIT(inet_csk_clone, smack_inet_csk_clone), /* key management security hooks */ #ifdef CONFIG_KEYS LSM_HOOK_INIT(key_alloc, smack_key_alloc), LSM_HOOK_INIT(key_permission, smack_key_permission), LSM_HOOK_INIT(key_getsecurity, smack_key_getsecurity), #ifdef CONFIG_KEY_NOTIFICATIONS LSM_HOOK_INIT(watch_key, smack_watch_key), #endif #endif /* CONFIG_KEYS */ #ifdef CONFIG_WATCH_QUEUE 
	LSM_HOOK_INIT(post_notification, smack_post_notification),
#endif

	/* Audit hooks */
#ifdef CONFIG_AUDIT
	LSM_HOOK_INIT(audit_rule_init, smack_audit_rule_init),
	LSM_HOOK_INIT(audit_rule_known, smack_audit_rule_known),
	LSM_HOOK_INIT(audit_rule_match, smack_audit_rule_match),
#endif /* CONFIG_AUDIT */

	LSM_HOOK_INIT(ismaclabel, smack_ismaclabel),
	LSM_HOOK_INIT(secid_to_secctx, smack_secid_to_secctx),
	LSM_HOOK_INIT(lsmprop_to_secctx, smack_lsmprop_to_secctx),
	LSM_HOOK_INIT(secctx_to_secid, smack_secctx_to_secid),
	LSM_HOOK_INIT(inode_notifysecctx, smack_inode_notifysecctx),
	LSM_HOOK_INIT(inode_setsecctx, smack_inode_setsecctx),
	LSM_HOOK_INIT(inode_getsecctx, smack_inode_getsecctx),
	LSM_HOOK_INIT(inode_copy_up, smack_inode_copy_up),
	LSM_HOOK_INIT(inode_copy_up_xattr, smack_inode_copy_up_xattr),
	LSM_HOOK_INIT(dentry_create_files_as, smack_dentry_create_files_as),
#ifdef CONFIG_IO_URING
	LSM_HOOK_INIT(uring_override_creds, smack_uring_override_creds),
	LSM_HOOK_INIT(uring_sqpoll, smack_uring_sqpoll),
	LSM_HOOK_INIT(uring_cmd, smack_uring_cmd),
#endif
};

static __init void init_smack_known_list(void)
{
	/*
	 * Initialize rule list locks
	 */
	mutex_init(&smack_known_huh.smk_rules_lock);
	mutex_init(&smack_known_hat.smk_rules_lock);
	mutex_init(&smack_known_floor.smk_rules_lock);
	mutex_init(&smack_known_star.smk_rules_lock);
	mutex_init(&smack_known_web.smk_rules_lock);
	/*
	 * Initialize rule lists
	 */
	INIT_LIST_HEAD(&smack_known_huh.smk_rules);
	INIT_LIST_HEAD(&smack_known_hat.smk_rules);
	INIT_LIST_HEAD(&smack_known_star.smk_rules);
	INIT_LIST_HEAD(&smack_known_floor.smk_rules);
	INIT_LIST_HEAD(&smack_known_web.smk_rules);
	/*
	 * Create the known labels list
	 */
	smk_insert_entry(&smack_known_huh);
	smk_insert_entry(&smack_known_hat);
	smk_insert_entry(&smack_known_star);
	smk_insert_entry(&smack_known_floor);
	smk_insert_entry(&smack_known_web);
}

/**
 * smack_init - initialize the smack system
 *
 * Returns 0 on success, -ENOMEM if there's no memory
 */
static __init int smack_init(void)
{
	struct cred *cred = (struct cred *) current->cred;
	struct task_smack *tsp;

	smack_rule_cache = KMEM_CACHE(smack_rule, 0);
	if (!smack_rule_cache)
		return -ENOMEM;

	/*
	 * Set the security state for the initial task.
	 */
	tsp = smack_cred(cred);
	init_task_smack(tsp, &smack_known_floor, &smack_known_floor);

	/*
	 * Register with LSM
	 */
	security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), &smack_lsmid);
	smack_enabled = 1;

	pr_info("Smack: Initializing.\n");
#ifdef CONFIG_SECURITY_SMACK_NETFILTER
	pr_info("Smack: Netfilter enabled.\n");
#endif
#ifdef SMACK_IPV6_PORT_LABELING
	pr_info("Smack: IPv6 port labeling enabled.\n");
#endif
#ifdef SMACK_IPV6_SECMARK_LABELING
	pr_info("Smack: IPv6 Netfilter enabled.\n");
#endif

	/* initialize the smack_known_list */
	init_smack_known_list();

	return 0;
}

/*
 * Smack requires early initialization in order to label
 * all processes and objects when they are created.
 */
DEFINE_LSM(smack) = {
	.name = "smack",
	.flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
	.blobs = &smack_blob_sizes,
	.init = smack_init,
};
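
/*
 * Editorial sketch (hypothetical, user space): once smack_init() has run
 * and security_add_hooks() has registered the hooks, "smack" shows up in
 * the comma-separated list of initialized LSMs.
 */
#include <stdio.h>

static void print_active_lsms(void)
{
	char buf[256];
	FILE *f = fopen("/sys/kernel/security/lsm", "r");

	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("active LSMs: %s\n", buf);
		fclose(f);
	}
}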
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H

#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/bio.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include "volumes.h"

struct page;
struct sector_ptr;
struct btrfs_fs_info;

enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
};

struct btrfs_raid_bio {
	struct btrfs_io_context *bioc;

	/*
	 * While we're doing RMW on a stripe we put it into a hash table so we
	 * can lock the stripe and merge more rbios into it.
	 */
	struct list_head hash_list;

	/* LRU list for the stripe cache */
	struct list_head stripe_cache;

	/* For scheduling work in the helper threads */
	struct work_struct work;

	/*
	 * bio_list and bio_list_lock are used to add more bios into the stripe
	 * in hopes of avoiding the full RMW
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * Also protected by the bio_list_lock, the plug list is used by the
	 * plugging code to collect partial bios while plugged. The stripe
	 * locking code also uses it to hand off the stripe lock to the next
	 * pending IO.
	 */
	struct list_head plug_list;

	/* Flags that tell us if it is safe to merge with this bio. */
	unsigned long flags;

	/*
	 * Set if we're doing a parity rebuild for a read from higher up, which
	 * is handled differently from a parity rebuild as part of RMW.
	 */
	enum btrfs_rbio_ops operation;

	/* How many pages there are for the full stripe including P/Q */
	u16 nr_pages;

	/* How many sectors there are for the full stripe including P/Q */
	u16 nr_sectors;

	/* Number of data stripes (no p/q) */
	u8 nr_data;

	/* Number of all stripes (including P/Q) */
	u8 real_stripes;

	/* How many pages there are for each stripe */
	u8 stripe_npages;

	/* How many sectors there are for each stripe */
	u8 stripe_nsectors;

	/* Stripe number that we're scrubbing */
	u8 scrubp;

	/*
	 * Size of all the bios in the bio_list. This helps us decide if the
	 * rbio maps to a full stripe or not.
	 */
	int bio_list_bytes;

	refcount_t refs;

	atomic_t stripes_pending;

	wait_queue_head_t io_wait;

	/* Bitmap to record which horizontal stripe has data */
	unsigned long dbitmap;

	/* Allocated with stripe_nsectors-many bits for finish_*() calls */
	unsigned long finish_pbitmap;

	/*
	 * These are two arrays of pointers. We allocate the rbio big enough
	 * to hold them both and setup their locations when the rbio is
	 * allocated.
	 */

	/*
	 * Pointers to pages that we allocated for reading/writing stripes
	 * directly from the disk (including P/Q).
*/ struct page **stripe_pages; /* Pointers to the sectors in the bio_list, for faster lookup */ struct sector_ptr *bio_sectors; /* * For subpage support, we need to map each sector to above * stripe_pages. */ struct sector_ptr *stripe_sectors; /* Allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; /* * The bitmap recording where IO errors happened. * Each bit is corresponding to one sector in either bio_sectors[] or * stripe_sectors[] array. * * The reason we don't use another bit in sector_ptr is, we have two * arrays of sectors, and a lot of IO can use sectors in both arrays. * Thus making it much harder to iterate. */ unsigned long *error_bitmap; /* * Checksum buffer if the rbio is for data. The buffer should cover * all data sectors (excluding P/Q sectors). */ u8 *csum_buf; /* * Each bit represents if the corresponding sector has data csum found. * Should only cover data sectors (excluding P/Q sectors). */ unsigned long *csum_bitmap; }; /* * For trace event usage only. Records useful debug info for each bio submitted * by RAID56 to each physical device. * * No matter signed or not, (-1) is always the one indicating we can not grab * the proper stripe number. */ struct raid56_bio_trace_info { u64 devid; /* The offset inside the stripe. (<= STRIPE_LEN) */ u32 offset; /* * Stripe number. * 0 is the first data stripe, and nr_data for P stripe, * nr_data + 1 for Q stripe. * >= real_stripes for */ u8 stripe_nr; }; static inline int nr_data_stripes(const struct btrfs_chunk_map *map) { return map->num_stripes - btrfs_nr_parity_stripes(map->type); } static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc) { return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type); } #define RAID5_P_STRIPE ((u64)-2) #define RAID6_Q_STRIPE ((u64)-1) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) struct btrfs_device; void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, int mirror_num); void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, struct page **data_pages, u64 data_logical); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); #endif
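
/*
 * Editorial sketch (hypothetical, not kernel code): the P stripe the
 * structures above track is the byte-wise XOR of the data stripes, so
 * any one missing data stripe can be rebuilt by XOR-ing the survivors
 * with P (the Q stripe uses GF(2^8) arithmetic, not shown).
 */
#include <stddef.h>

static void compute_p(unsigned char *p, unsigned char * const *data,
		      int nr_data, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		unsigned char v = 0;

		for (int s = 0; s < nr_data; s++)
			v ^= data[s][i];
		p[i] = v;
	}
}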
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Ioctl to read verity metadata
 *
 * Copyright 2021 Google LLC
 */

#include "fsverity_private.h"

#include <linux/backing-dev.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>

static int fsverity_read_merkle_tree(struct inode *inode,
				     const struct fsverity_info *vi,
				     void __user *buf, u64 offset, int length)
{
	const struct fsverity_operations *vops = inode->i_sb->s_vop;
	u64 end_offset;
	unsigned int offs_in_page;
	pgoff_t index, last_index;
	int retval = 0;
	int err = 0;

	end_offset = min(offset + length, vi->tree_params.tree_size);
	if (offset >= end_offset)
		return 0;
	offs_in_page = offset_in_page(offset);
	last_index = (end_offset - 1) >> PAGE_SHIFT;

	/*
	 * Iterate through each Merkle tree page in the requested range and
	 * copy the requested portion to userspace. Note that the Merkle tree
	 * block size isn't important here, as we are returning a byte stream;
	 * i.e., we can just work with pages even if the tree block size !=
	 * PAGE_SIZE.
	 */
	for (index = offset >> PAGE_SHIFT; index <= last_index; index++) {
		unsigned long num_ra_pages =
			min_t(unsigned long, last_index - index + 1,
			      inode->i_sb->s_bdi->io_pages);
		unsigned int bytes_to_copy = min_t(u64, end_offset - offset,
						   PAGE_SIZE - offs_in_page);
		struct page *page;
		const void *virt;

		page = vops->read_merkle_tree_page(inode, index, num_ra_pages);
		if (IS_ERR(page)) {
			err = PTR_ERR(page);
			fsverity_err(inode,
				     "Error %d reading Merkle tree page %lu",
				     err, index);
			break;
		}

		virt = kmap_local_page(page);
		if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) {
			kunmap_local(virt);
			put_page(page);
			err = -EFAULT;
			break;
		}
		kunmap_local(virt);
		put_page(page);

		retval += bytes_to_copy;
		buf += bytes_to_copy;
		offset += bytes_to_copy;

		if (fatal_signal_pending(current)) {
			err = -EINTR;
			break;
		}
		cond_resched();
		offs_in_page = 0;
	}
	return retval ? retval : err;
}

/* Copy the requested portion of the buffer to userspace.
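 */

/*
 * Editorial sketch (hypothetical, user space): the per-page arithmetic
 * from fsverity_read_merkle_tree() above; only the first copy starts at
 * a non-zero offset within a page, every later copy starts at 0.
 */
#include <stdio.h>

#define EX_PAGE_SIZE 4096ULL

static void show_chunks(unsigned long long offset, unsigned long long end)
{
	unsigned long long offs = offset % EX_PAGE_SIZE;
	unsigned long long idx;

	for (idx = offset / EX_PAGE_SIZE; idx <= (end - 1) / EX_PAGE_SIZE;
	     idx++) {
		unsigned long long n = end - offset;

		if (n > EX_PAGE_SIZE - offs)
			n = EX_PAGE_SIZE - offs;
		printf("page %llu: %llu bytes from offset %llu\n",
		       idx, n, offs);
		offset += n;
		offs = 0;
	}
}

/*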
*/ static int fsverity_read_buffer(void __user *dst, u64 offset, int length, const void *src, size_t src_length) { if (offset >= src_length) return 0; src += offset; src_length -= offset; length = min_t(size_t, length, src_length); if (copy_to_user(dst, src, length)) return -EFAULT; return length; } static int fsverity_read_descriptor(struct inode *inode, void __user *buf, u64 offset, int length) { struct fsverity_descriptor *desc; size_t desc_size; int res; res = fsverity_get_descriptor(inode, &desc); if (res) return res; /* don't include the builtin signature */ desc_size = offsetof(struct fsverity_descriptor, signature); desc->sig_size = 0; res = fsverity_read_buffer(buf, offset, length, desc, desc_size); kfree(desc); return res; } static int fsverity_read_signature(struct inode *inode, void __user *buf, u64 offset, int length) { struct fsverity_descriptor *desc; int res; res = fsverity_get_descriptor(inode, &desc); if (res) return res; if (desc->sig_size == 0) { res = -ENODATA; goto out; } /* * Include only the builtin signature. fsverity_get_descriptor() * already verified that sig_size is in-bounds. */ res = fsverity_read_buffer(buf, offset, length, desc->signature, le32_to_cpu(desc->sig_size)); out: kfree(desc); return res; } /** * fsverity_ioctl_read_metadata() - read verity metadata from a file * @filp: file to read the metadata from * @uarg: user pointer to fsverity_read_metadata_arg * * Return: length read on success, 0 on EOF, -errno on failure */ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) { struct inode *inode = file_inode(filp); const struct fsverity_info *vi; struct fsverity_read_metadata_arg arg; int length; void __user *buf; vi = fsverity_get_info(inode); if (!vi) return -ENODATA; /* not a verity file */ /* * Note that we don't have to explicitly check that the file is open for * reading, since verity files can only be opened for reading. */ if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (arg.__reserved) return -EINVAL; /* offset + length must not overflow. */ if (arg.offset + arg.length < arg.offset) return -EINVAL; /* Ensure that the return value will fit in INT_MAX. */ length = min_t(u64, arg.length, INT_MAX); buf = u64_to_user_ptr(arg.buf_ptr); switch (arg.metadata_type) { case FS_VERITY_METADATA_TYPE_MERKLE_TREE: return fsverity_read_merkle_tree(inode, vi, buf, arg.offset, length); case FS_VERITY_METADATA_TYPE_DESCRIPTOR: return fsverity_read_descriptor(inode, buf, arg.offset, length); case FS_VERITY_METADATA_TYPE_SIGNATURE: return fsverity_read_signature(inode, buf, arg.offset, length); default: return -EINVAL; } } EXPORT_SYMBOL_GPL(fsverity_ioctl_read_metadata);
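
/*
 * Editorial sketch (hypothetical, user space): exercising the ioctl
 * implemented above to read the start of a verity file's Merkle tree.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fsverity.h>

static long read_merkle_head(int fd, void *buf, unsigned long long len)
{
	struct fsverity_read_metadata_arg arg;

	memset(&arg, 0, sizeof(arg));
	arg.metadata_type = FS_VERITY_METADATA_TYPE_MERKLE_TREE;
	arg.offset = 0;
	arg.length = len;
	arg.buf_ptr = (unsigned long long)(unsigned long)buf;

	return ioctl(fd, FS_IOC_READ_VERITY_METADATA, &arg);
}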
// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2021 Red Hat, Inc.  All rights reserved.
**
**
*******************************************************************************
******************************************************************************/

/*
 * midcomms.c
 *
 * This is the appallingly named "mid-level" comms layer.
 * It provides "reliable", application-layer message delivery on top of
 * the lowcomms transport layer.
 *
 * How it works:
 *
 * Each node keeps track of all sent DLM messages in send_queue, tagged
 * with a sequence number. The receiving side sends a DLM_ACK message
 * back for every DLM message it receives. If a reconnect happens in
 * lowcomms we will send all unacknowledged DLM messages again. The
 * receiving side may drop any message it already received by comparing
 * sequence numbers.
 *
 * How version detection works:
 *
 * Because DLM has pre-configured node addresses on every side, it is in
 * its nature that every side connects at startup and begins to transmit
 * DLM messages, which ends in a race. However, DLM_RCOM_NAMES,
 * DLM_RCOM_STATUS and their replies are the first messages which are
 * exchanged. For backwards compatibility these messages are not covered
 * by the midcomms re-transmission layer; they have their own
 * re-transmission handling in the DLM application layer. The version
 * field of every node will be set from these RCOM messages as soon as
 * they arrive, if the node isn't yet part of the nodes hash. There is
 * also logic to detect version mismatches, in case something weird is
 * going on or the first message isn't an expected one.
 *
 * Termination:
 *
 * The midcomms layer does a 4-way handshake for termination of the DLM
 * protocol, like TCP supports with half-closed sockets. SCTP doesn't
 * support half-closed sockets, so we do it at the DLM layer. Also, a
 * socket shutdown() can be interrupted by e.g. a TCP reset itself.
 * Additionally, there is the othercon paradigm in lowcomms, which cannot
 * easily be changed without breaking backwards compatibility. A node
 * cannot send anything to another node once a DLM_FIN message has been
 * sent; there is additional logic to print a warning if DLM tries to do
 * so. There is state handling like RFC 793, but reduced to termination
 * only. The "member removal event" means the cluster manager removed
 * the node from its internal lists; at this point DLM does not send any
 * message to the other node. There are two cases:
 *
 * 1. The cluster member was removed and we received a FIN
 * OR
 * 2. We received a FIN but the member was not removed yet
 *
 * One of these cases will do the CLOSE_WAIT to LAST_ACK change.
 *
 *
 *                              +---------+
 *                              | CLOSED  |
 *                              +---------+
 *                                   | add member/receive RCOM version
 *                                   |            detection msg
 *                                   V
 *                              +---------+
 *                              |  ESTAB  |
 *                              +---------+
 *                        CLOSE      |   |      rcv FIN
 *                       -------     |   |      -------
 * +---------+   snd FIN     /            \   snd ACK        +---------+
 * |   FIN   |<-------------               ----------------->|  CLOSE  |
 * |  WAIT-1 |-------------                                  |   WAIT  |
 * +---------+   rcv FIN    \                                +---------+
 *   | rcv ACK of FIN   -------   |                         CLOSE  | member
 *   | --------------   snd ACK   |                        ------- | removal
 *   V        x                   V                        snd FIN V event
 * +---------+              +---------+                      +---------+
 * |FINWAIT-2|              | CLOSING |                      | LAST-ACK|
 * +---------+              +---------+                      +---------+
 *   |            rcv ACK of FIN |               rcv ACK of FIN |
 *   | rcv FIN    -------------- |               -------------- |
 *   | -------          x        V                      x       V
 *    \ snd ACK             +---------+                      +---------+
 *     -------------------->|  CLOSED |                      |  CLOSED |
 *                          +---------+                      +---------+
 *
 * NOTE: any state can be interrupted by midcomms_close() and the state
 * will switch to CLOSED in case of fencing. There is also some timeout
 * handling when we receive the version detection RCOM messages, which
 * was tuned by observation.
 *
 * Future improvements:
 *
 * There are some known issues/improvements in the DLM handling.
 * Some of them should be done in a future major DLM version bump, which
 * would make it incompatible with previous versions.
 *
 * Unaligned memory access:
 *
 * There are cases where the DLM message buffer length is not aligned to
 * 8 bytes. However, nobody seems to have detected any problem with it
 * yet. This can be fixed in the next major version bump of DLM.
 *
 * Version detection:
 *
 * The version detection, and how it's done, is related to backwards
 * compatibility. There are better ways to handle it; however, this
 * should be changed in the next major version bump of DLM.
 *
 * Tail size checking:
 *
 * There is a message tail payload in e.g. DLM_MSG; however, we don't
 * yet check it against the message length with regard to the receive
 * buffer length. That needs to be validated.
 *
 * Fencing bad nodes:
 *
 * On timeouts or weird sequence number behaviour we should send a
 * fencing request to the cluster manager.
 */

/* Debug switch to enable a 5 second sleep while waiting for a
 * termination. This can be useful to test fencing while termination is
 * running. This requires a setup with only gfs2 as the DLM user, so that
 * the last umount will terminate the connection.
 *
 * It is also useful for testing: while umount blocks for those 5
 * seconds, just press the reset button. With a lot of drops the
 * termination process can take several seconds.
 */
#define DLM_DEBUG_FENCE_TERMINATION	0

#include <trace/events/dlm.h>
#include <net/tcp.h>

#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
#include "memory.h"
#include "lock.h"
#include "util.h"
#include "midcomms.h"

/* init value for sequence numbers for testing purposes only e.g. overflows */
#define DLM_SEQ_INIT		0
/* 5 seconds wait to sync ending of dlm */
#define DLM_SHUTDOWN_TIMEOUT	msecs_to_jiffies(5000)
#define DLM_VERSION_NOT_SET	0
#define DLM_SEND_ACK_BACK_MSG_THRESHOLD	32
#define DLM_RECV_ACK_BACK_MSG_THRESHOLD	(DLM_SEND_ACK_BACK_MSG_THRESHOLD * 8)

struct midcomms_node {
	int nodeid;

	uint32_t version;
	atomic_t seq_send;
	atomic_t seq_next;
	/* These queues are unbound because we cannot drop any message in
	 * dlm. We could send a fence signal for a specific node to the
	 * cluster manager if a queue hits some maximum value; however,
	 * this handling is not supported yet.
	 */
	struct list_head send_queue;
	spinlock_t send_queue_lock;
	atomic_t send_queue_cnt;
#define DLM_NODE_FLAG_CLOSE	1
#define DLM_NODE_FLAG_STOP_TX	2
#define DLM_NODE_FLAG_STOP_RX	3
	atomic_t ulp_delivered;
	unsigned long flags;
	wait_queue_head_t shutdown_wait;

	/* dlm tcp termination state */
#define DLM_CLOSED	1
#define DLM_ESTABLISHED	2
#define DLM_FIN_WAIT1	3
#define DLM_FIN_WAIT2	4
#define DLM_CLOSE_WAIT	5
#define DLM_LAST_ACK	6
#define DLM_CLOSING	7
	int state;
	spinlock_t state_lock;

	/* Counts how many lockspaces are using this node.
	 * This refcount is necessary to determine if the
	 * node wants to disconnect.
	 */
	int users;

	/* not protected by srcu, node_hash lifetime */
	void *debugfs;

	struct hlist_node hlist;
	struct rcu_head rcu;
};

struct dlm_mhandle {
	const union dlm_packet *inner_p;
	struct midcomms_node *node;
	struct dlm_opts *opts;
	struct dlm_msg *msg;
	bool committed;
	uint32_t seq;

	void (*ack_rcv)(struct midcomms_node *node);

	/* get_mhandle/commit srcu idx exchange */
	int idx;

	struct list_head list;
	struct rcu_head rcu;
};

static struct hlist_head node_hash[CONN_HASH_SIZE];
static DEFINE_SPINLOCK(nodes_lock);
DEFINE_STATIC_SRCU(nodes_srcu);
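
/*
 * Editorial sketch (hypothetical, not kernel code): the retransmit queue
 * above is pruned with the same wrap-safe sequence comparison midcomms
 * borrows from TCP's before(): "a is older than b" is decided by the
 * sign of their 32-bit difference, so it stays correct across overflow.
 */
#include <stdbool.h>
#include <stdint.h>

static bool seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}
/* e.g. seq_before(0xffffffffu, 1u) is true despite the wrap-around. */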
struct kmem_cache *dlm_midcomms_cache_create(void)
{
	return KMEM_CACHE(dlm_mhandle, 0);
}

static inline const char *dlm_state_str(int state)
{
	switch (state) {
	case DLM_CLOSED:
		return "CLOSED";
	case DLM_ESTABLISHED:
		return "ESTABLISHED";
	case DLM_FIN_WAIT1:
		return "FIN_WAIT1";
	case DLM_FIN_WAIT2:
		return "FIN_WAIT2";
	case DLM_CLOSE_WAIT:
		return "CLOSE_WAIT";
	case DLM_LAST_ACK:
		return "LAST_ACK";
	case DLM_CLOSING:
		return "CLOSING";
	default:
		return "UNKNOWN";
	}
}

const char *dlm_midcomms_state(struct midcomms_node *node)
{
	return dlm_state_str(node->state);
}

unsigned long dlm_midcomms_flags(struct midcomms_node *node)
{
	return node->flags;
}

int dlm_midcomms_send_queue_cnt(struct midcomms_node *node)
{
	return atomic_read(&node->send_queue_cnt);
}

uint32_t dlm_midcomms_version(struct midcomms_node *node)
{
	return node->version;
}

static struct midcomms_node *__find_node(int nodeid, int r)
{
	struct midcomms_node *node;

	hlist_for_each_entry_rcu(node, &node_hash[r], hlist) {
		if (node->nodeid == nodeid)
			return node;
	}

	return NULL;
}

static void dlm_mhandle_release(struct rcu_head *rcu)
{
	struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu);

	dlm_lowcomms_put_msg(mh->msg);
	dlm_free_mhandle(mh);
}

static void dlm_mhandle_delete(struct midcomms_node *node,
			       struct dlm_mhandle *mh)
{
	list_del_rcu(&mh->list);
	atomic_dec(&node->send_queue_cnt);
	call_rcu(&mh->rcu, dlm_mhandle_release);
}

static void dlm_send_queue_flush(struct midcomms_node *node)
{
	struct dlm_mhandle *mh;

	pr_debug("flush midcomms send queue of node %d\n", node->nodeid);

	rcu_read_lock();
	spin_lock_bh(&node->send_queue_lock);
	list_for_each_entry_rcu(mh, &node->send_queue, list) {
		dlm_mhandle_delete(node, mh);
	}
	spin_unlock_bh(&node->send_queue_lock);
	rcu_read_unlock();
}

static void midcomms_node_reset(struct midcomms_node *node)
{
	pr_debug("reset node %d\n", node->nodeid);

	atomic_set(&node->seq_next, DLM_SEQ_INIT);
	atomic_set(&node->seq_send, DLM_SEQ_INIT);
	atomic_set(&node->ulp_delivered, 0);
	node->version = DLM_VERSION_NOT_SET;
	node->flags = 0;

	dlm_send_queue_flush(node);
	node->state = DLM_CLOSED;
	wake_up(&node->shutdown_wait);
}

static struct midcomms_node *nodeid2node(int nodeid)
{
	return __find_node(nodeid, nodeid_hash(nodeid));
}

int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr)
{
	int ret, idx, r = nodeid_hash(nodeid);
	struct midcomms_node *node;

	ret = dlm_lowcomms_addr(nodeid, addr);
	if (ret)
		return ret;

	idx = srcu_read_lock(&nodes_srcu);
	node = __find_node(nodeid, r);
	if (node) {
		srcu_read_unlock(&nodes_srcu, idx);
		return 0;
	}
	srcu_read_unlock(&nodes_srcu, idx);

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->nodeid = nodeid;
	spin_lock_init(&node->state_lock);
	spin_lock_init(&node->send_queue_lock);
	atomic_set(&node->send_queue_cnt, 0);
	INIT_LIST_HEAD(&node->send_queue);
	init_waitqueue_head(&node->shutdown_wait);
	node->users = 0;
	midcomms_node_reset(node);

	spin_lock_bh(&nodes_lock);
	hlist_add_head_rcu(&node->hlist, &node_hash[r]);
	spin_unlock_bh(&nodes_lock);

	node->debugfs = dlm_create_debug_comms_file(nodeid, node);
	return 0;
}

static int dlm_send_ack(int nodeid, uint32_t seq)
{
	int mb_len = sizeof(struct dlm_header);
	struct dlm_header *m_header;
	struct dlm_msg *msg;
	char *ppc;

	msg = dlm_lowcomms_new_msg(nodeid, mb_len, &ppc, NULL, NULL);
	if (!msg)
		return
-ENOMEM; m_header = (struct dlm_header *)ppc; m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid()); m_header->h_length = cpu_to_le16(mb_len); m_header->h_cmd = DLM_ACK; m_header->u.h_seq = cpu_to_le32(seq); dlm_lowcomms_commit_msg(msg); dlm_lowcomms_put_msg(msg); return 0; } static void dlm_send_ack_threshold(struct midcomms_node *node, uint32_t threshold) { uint32_t oval, nval; bool send_ack; /* let only send one user trigger threshold to send ack back */ do { oval = atomic_read(&node->ulp_delivered); send_ack = (oval > threshold); /* abort if threshold is not reached */ if (!send_ack) break; nval = 0; /* try to reset ulp_delivered counter */ } while (atomic_cmpxchg(&node->ulp_delivered, oval, nval) != oval); if (send_ack) dlm_send_ack(node->nodeid, atomic_read(&node->seq_next)); } static int dlm_send_fin(struct midcomms_node *node, void (*ack_rcv)(struct midcomms_node *node)) { int mb_len = sizeof(struct dlm_header); struct dlm_header *m_header; struct dlm_mhandle *mh; char *ppc; mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, &ppc); if (!mh) return -ENOMEM; set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); mh->ack_rcv = ack_rcv; m_header = (struct dlm_header *)ppc; m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid()); m_header->h_length = cpu_to_le16(mb_len); m_header->h_cmd = DLM_FIN; pr_debug("sending fin msg to node %d\n", node->nodeid); dlm_midcomms_commit_mhandle(mh, NULL, 0); return 0; } static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) { struct dlm_mhandle *mh; rcu_read_lock(); list_for_each_entry_rcu(mh, &node->send_queue, list) { if (before(mh->seq, seq)) { if (mh->ack_rcv) mh->ack_rcv(node); } else { /* send queue should be ordered */ break; } } spin_lock_bh(&node->send_queue_lock); list_for_each_entry_rcu(mh, &node->send_queue, list) { if (before(mh->seq, seq)) { dlm_mhandle_delete(node, mh); } else { /* send queue should be ordered */ break; } } spin_unlock_bh(&node->send_queue_lock); rcu_read_unlock(); } static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) { spin_lock_bh(&node->state_lock); pr_debug("receive passive fin ack from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); switch (node->state) { case DLM_LAST_ACK: /* DLM_CLOSED */ midcomms_node_reset(node); break; case DLM_CLOSED: /* not valid but somehow we got what we want */ wake_up(&node->shutdown_wait); break; default: spin_unlock_bh(&node->state_lock); log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; } spin_unlock_bh(&node->state_lock); } static void dlm_receive_buffer_3_2_trace(uint32_t seq, const union dlm_packet *p) { switch (p->header.h_cmd) { case DLM_MSG: trace_dlm_recv_message(dlm_our_nodeid(), seq, &p->message); break; case DLM_RCOM: trace_dlm_recv_rcom(dlm_our_nodeid(), seq, &p->rcom); break; default: break; } } static void dlm_midcomms_receive_buffer(const union dlm_packet *p, struct midcomms_node *node, uint32_t seq) { bool is_expected_seq; uint32_t oval, nval; do { oval = atomic_read(&node->seq_next); is_expected_seq = (oval == seq); if (!is_expected_seq) break; nval = oval + 1; } while (atomic_cmpxchg(&node->seq_next, oval, nval) != oval); if (is_expected_seq) { switch (p->header.h_cmd) { case DLM_FIN: spin_lock_bh(&node->state_lock); pr_debug("receive fin msg from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); switch (node->state) { case DLM_ESTABLISHED: 
dlm_send_ack(node->nodeid, nval); /* passive shutdown DLM_LAST_ACK case 1 * additional we check if the node is used by * cluster manager events at all. */ if (node->users == 0) { node->state = DLM_LAST_ACK; pr_debug("switch node %d to state %s case 1\n", node->nodeid, dlm_state_str(node->state)); set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); dlm_send_fin(node, dlm_pas_fin_ack_rcv); } else { node->state = DLM_CLOSE_WAIT; pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); } break; case DLM_FIN_WAIT1: dlm_send_ack(node->nodeid, nval); node->state = DLM_CLOSING; set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; case DLM_FIN_WAIT2: dlm_send_ack(node->nodeid, nval); midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; case DLM_LAST_ACK: /* probably remove_member caught it, do nothing */ break; default: spin_unlock_bh(&node->state_lock); log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; } spin_unlock_bh(&node->state_lock); break; default: WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer_3_2_trace(seq, p); dlm_receive_buffer(p, node->nodeid); atomic_inc(&node->ulp_delivered); /* unlikely case to send ack back when we don't transmit */ dlm_send_ack_threshold(node, DLM_RECV_ACK_BACK_MSG_THRESHOLD); break; } } else { /* retry to ack message which we already have by sending back * current node->seq_next number as ack. */ if (seq < oval) dlm_send_ack(node->nodeid, oval); log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", seq, oval, node->nodeid); } } static int dlm_opts_check_msglen(const union dlm_packet *p, uint16_t msglen, int nodeid) { int len = msglen; /* we only trust outer header msglen because * it's checked against receive buffer length. 
*/ if (len < sizeof(struct dlm_opts)) return -1; len -= sizeof(struct dlm_opts); if (len < le16_to_cpu(p->opts.o_optlen)) return -1; len -= le16_to_cpu(p->opts.o_optlen); switch (p->opts.o_nextcmd) { case DLM_FIN: if (len < sizeof(struct dlm_header)) { log_print("fin too small: %d, will skip this message from node %d", len, nodeid); return -1; } break; case DLM_MSG: if (len < sizeof(struct dlm_message)) { log_print("msg too small: %d, will skip this message from node %d", msglen, nodeid); return -1; } break; case DLM_RCOM: if (len < sizeof(struct dlm_rcom)) { log_print("rcom msg too small: %d, will skip this message from node %d", len, nodeid); return -1; } break; default: log_print("unsupported o_nextcmd received: %u, will skip this message from node %d", p->opts.o_nextcmd, nodeid); return -1; } return 0; } static void dlm_midcomms_receive_buffer_3_2(const union dlm_packet *p, int nodeid) { uint16_t msglen = le16_to_cpu(p->header.h_length); struct midcomms_node *node; uint32_t seq; int ret, idx; idx = srcu_read_lock(&nodes_srcu); node = nodeid2node(nodeid); if (WARN_ON_ONCE(!node)) goto out; switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_2; wake_up(&node->shutdown_wait); log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, node->nodeid); spin_lock(&node->state_lock); switch (node->state) { case DLM_CLOSED: node->state = DLM_ESTABLISHED; pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; default: break; } spin_unlock(&node->state_lock); break; case DLM_VERSION_3_2: break; default: log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", DLM_VERSION_3_2, node->nodeid, node->version); goto out; } switch (p->header.h_cmd) { case DLM_RCOM: /* these rcom message we use to determine version. * they have their own retransmission handling and * are the first messages of dlm. * * length already checked. 
*/ switch (p->rcom.rc_type) { case cpu_to_le32(DLM_RCOM_NAMES): fallthrough; case cpu_to_le32(DLM_RCOM_NAMES_REPLY): fallthrough; case cpu_to_le32(DLM_RCOM_STATUS): fallthrough; case cpu_to_le32(DLM_RCOM_STATUS_REPLY): break; default: log_print("unsupported rcom type received: %u, will skip this message from node %d", le32_to_cpu(p->rcom.rc_type), nodeid); goto out; } WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer(p, nodeid); break; case DLM_OPTS: seq = le32_to_cpu(p->header.u.h_seq); ret = dlm_opts_check_msglen(p, msglen, nodeid); if (ret < 0) { log_print("opts msg too small: %u, will skip this message from node %d", msglen, nodeid); goto out; } p = (union dlm_packet *)((unsigned char *)p->opts.o_opts + le16_to_cpu(p->opts.o_optlen)); /* recheck inner msglen just if it's not garbage */ msglen = le16_to_cpu(p->header.h_length); switch (p->header.h_cmd) { case DLM_RCOM: if (msglen < sizeof(struct dlm_rcom)) { log_print("inner rcom msg too small: %u, will skip this message from node %d", msglen, nodeid); goto out; } break; case DLM_MSG: if (msglen < sizeof(struct dlm_message)) { log_print("inner msg too small: %u, will skip this message from node %d", msglen, nodeid); goto out; } break; case DLM_FIN: if (msglen < sizeof(struct dlm_header)) { log_print("inner fin too small: %u, will skip this message from node %d", msglen, nodeid); goto out; } break; default: log_print("unsupported inner h_cmd received: %u, will skip this message from node %d", msglen, nodeid); goto out; } dlm_midcomms_receive_buffer(p, node, seq); break; case DLM_ACK: seq = le32_to_cpu(p->header.u.h_seq); dlm_receive_ack(node, seq); break; default: log_print("unsupported h_cmd received: %u, will skip this message from node %d", p->header.h_cmd, nodeid); break; } out: srcu_read_unlock(&nodes_srcu, idx); } static void dlm_midcomms_receive_buffer_3_1(const union dlm_packet *p, int nodeid) { uint16_t msglen = le16_to_cpu(p->header.h_length); struct midcomms_node *node; int idx; idx = srcu_read_lock(&nodes_srcu); node = nodeid2node(nodeid); if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_1; wake_up(&node->shutdown_wait); log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1, node->nodeid); break; case DLM_VERSION_3_1: break; default: log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", DLM_VERSION_3_1, node->nodeid, node->version); srcu_read_unlock(&nodes_srcu, idx); return; } srcu_read_unlock(&nodes_srcu, idx); switch (p->header.h_cmd) { case DLM_RCOM: /* length already checked */ break; case DLM_MSG: if (msglen < sizeof(struct dlm_message)) { log_print("msg too small: %u, will skip this message from node %d", msglen, nodeid); return; } break; default: log_print("unsupported h_cmd received: %u, will skip this message from node %d", p->header.h_cmd, nodeid); return; } dlm_receive_buffer(p, nodeid); } int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len) { const unsigned char *ptr = buf; const struct dlm_header *hd; uint16_t msglen; int ret = 0; while (len >= sizeof(struct dlm_header)) { hd = (struct dlm_header *)ptr; /* no message should be more than DLM_MAX_SOCKET_BUFSIZE or * less than dlm_header size. * * Some messages does not have a 8 byte length boundary yet * which can occur in a unaligned memory access of some dlm * messages. 
However this problem need to be fixed at the * sending side, for now it seems nobody run into architecture * related issues yet but it slows down some processing. * Fixing this issue should be scheduled in future by doing * the next major version bump. */ msglen = le16_to_cpu(hd->h_length); if (msglen > DLM_MAX_SOCKET_BUFSIZE || msglen < sizeof(struct dlm_header)) { log_print("received invalid length header: %u from node %d, will abort message parsing", msglen, nodeid); return -EBADMSG; } /* caller will take care that leftover * will be parsed next call with more data */ if (msglen > len) break; ret += msglen; len -= msglen; ptr += msglen; } return ret; } /* * Called from the low-level comms layer to process a buffer of * commands. */ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) { const unsigned char *ptr = buf; const struct dlm_header *hd; uint16_t msglen; int ret = 0; while (len >= sizeof(struct dlm_header)) { hd = (struct dlm_header *)ptr; msglen = le16_to_cpu(hd->h_length); if (msglen > len) break; switch (hd->h_version) { case cpu_to_le32(DLM_VERSION_3_1): dlm_midcomms_receive_buffer_3_1((const union dlm_packet *)ptr, nodeid); break; case cpu_to_le32(DLM_VERSION_3_2): dlm_midcomms_receive_buffer_3_2((const union dlm_packet *)ptr, nodeid); break; default: log_print("received invalid version header: %u from node %d, will skip this message", le32_to_cpu(hd->h_version), nodeid); break; } ret += msglen; len -= msglen; ptr += msglen; } return ret; } void dlm_midcomms_unack_msg_resend(int nodeid) { struct midcomms_node *node; struct dlm_mhandle *mh; int idx, ret; idx = srcu_read_lock(&nodes_srcu); node = nodeid2node(nodeid); if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } /* old protocol, we don't support to retransmit on failure */ switch (node->version) { case DLM_VERSION_3_2: break; default: srcu_read_unlock(&nodes_srcu, idx); return; } rcu_read_lock(); list_for_each_entry_rcu(mh, &node->send_queue, list) { if (!mh->committed) continue; ret = dlm_lowcomms_resend_msg(mh->msg); if (!ret) log_print_ratelimited("retransmit dlm msg, seq %u, nodeid %d", mh->seq, node->nodeid); } rcu_read_unlock(); srcu_read_unlock(&nodes_srcu, idx); } static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len, uint32_t seq) { opts->o_header.h_cmd = DLM_OPTS; opts->o_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); opts->o_header.h_nodeid = cpu_to_le32(dlm_our_nodeid()); opts->o_header.h_length = cpu_to_le16(DLM_MIDCOMMS_OPT_LEN + inner_len); opts->o_header.u.h_seq = cpu_to_le32(seq); } static void midcomms_new_msg_cb(void *data) { struct dlm_mhandle *mh = data; atomic_inc(&mh->node->send_queue_cnt); spin_lock_bh(&mh->node->send_queue_lock); list_add_tail_rcu(&mh->list, &mh->node->send_queue); spin_unlock_bh(&mh->node->send_queue_lock); mh->seq = atomic_fetch_inc(&mh->node->seq_send); } static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid, int len, char **ppc) { struct dlm_opts *opts; struct dlm_msg *msg; msg = dlm_lowcomms_new_msg(nodeid, len + DLM_MIDCOMMS_OPT_LEN, ppc, midcomms_new_msg_cb, mh); if (!msg) return NULL; opts = (struct dlm_opts *)*ppc; mh->opts = opts; /* add possible options here */ dlm_fill_opts_header(opts, len, mh->seq); *ppc += sizeof(*opts); mh->inner_p = (const union dlm_packet *)*ppc; return msg; } /* avoid false positive for nodes_srcu, unlock happens in * dlm_midcomms_commit_mhandle which is a must call if success */ #ifndef __CHECKER__ struct dlm_mhandle 
*dlm_midcomms_get_mhandle(int nodeid, int len, char **ppc) { struct midcomms_node *node; struct dlm_mhandle *mh; struct dlm_msg *msg; int idx; idx = srcu_read_lock(&nodes_srcu); node = nodeid2node(nodeid); if (WARN_ON_ONCE(!node)) goto err; /* this is a bug, however we going on and hope it will be resolved */ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); mh = dlm_allocate_mhandle(); if (!mh) goto err; mh->committed = false; mh->ack_rcv = NULL; mh->idx = idx; mh->node = node; switch (node->version) { case DLM_VERSION_3_1: msg = dlm_lowcomms_new_msg(nodeid, len, ppc, NULL, NULL); if (!msg) { dlm_free_mhandle(mh); goto err; } break; case DLM_VERSION_3_2: /* send ack back if necessary */ dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD); msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, ppc); if (!msg) { dlm_free_mhandle(mh); goto err; } break; default: dlm_free_mhandle(mh); WARN_ON_ONCE(1); goto err; } mh->msg = msg; /* keep in mind that is a must to call * dlm_midcomms_commit_msg() which releases * nodes_srcu using mh->idx which is assumed * here that the application will call it. */ return mh; err: srcu_read_unlock(&nodes_srcu, idx); return NULL; } #endif static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh, const void *name, int namelen) { switch (mh->inner_p->header.h_cmd) { case DLM_MSG: trace_dlm_send_message(mh->node->nodeid, mh->seq, &mh->inner_p->message, name, namelen); break; case DLM_RCOM: trace_dlm_send_rcom(mh->node->nodeid, mh->seq, &mh->inner_p->rcom); break; default: /* nothing to trace */ break; } } static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh, const void *name, int namelen) { /* nexthdr chain for fast lookup */ mh->opts->o_nextcmd = mh->inner_p->header.h_cmd; mh->committed = true; dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen); dlm_lowcomms_commit_msg(mh->msg); } /* avoid false positive for nodes_srcu, lock was happen in * dlm_midcomms_get_mhandle */ #ifndef __CHECKER__ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen) { switch (mh->node->version) { case DLM_VERSION_3_1: srcu_read_unlock(&nodes_srcu, mh->idx); dlm_lowcomms_commit_msg(mh->msg); dlm_lowcomms_put_msg(mh->msg); /* mh is not part of rcu list in this case */ dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: /* held rcu read lock here, because we sending the * dlm message out, when we do that we could receive * an ack back which releases the mhandle and we * get a use after free. 
*/ rcu_read_lock(); dlm_midcomms_commit_msg_3_2(mh, name, namelen); srcu_read_unlock(&nodes_srcu, mh->idx); rcu_read_unlock(); break; default: srcu_read_unlock(&nodes_srcu, mh->idx); WARN_ON_ONCE(1); break; } } #endif int dlm_midcomms_start(void) { return dlm_lowcomms_start(); } void dlm_midcomms_stop(void) { dlm_lowcomms_stop(); } void dlm_midcomms_init(void) { int i; for (i = 0; i < CONN_HASH_SIZE; i++) INIT_HLIST_HEAD(&node_hash[i]); dlm_lowcomms_init(); } static void midcomms_node_release(struct rcu_head *rcu) { struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); dlm_send_queue_flush(node); kfree(node); } void dlm_midcomms_exit(void) { struct midcomms_node *node; int i, idx; idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { dlm_delete_debug_comms_file(node->debugfs); spin_lock(&nodes_lock); hlist_del_rcu(&node->hlist); spin_unlock(&nodes_lock); call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); } } srcu_read_unlock(&nodes_srcu, idx); dlm_lowcomms_exit(); } static void dlm_act_fin_ack_rcv(struct midcomms_node *node) { spin_lock_bh(&node->state_lock); pr_debug("receive active fin ack from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); switch (node->state) { case DLM_FIN_WAIT1: node->state = DLM_FIN_WAIT2; pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; case DLM_CLOSING: midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; case DLM_CLOSED: /* not valid but somehow we got what we want */ wake_up(&node->shutdown_wait); break; default: spin_unlock_bh(&node->state_lock); log_print("%s: unexpected state: %d", __func__, node->state); WARN_ON_ONCE(1); return; } spin_unlock_bh(&node->state_lock); } void dlm_midcomms_add_member(int nodeid) { struct midcomms_node *node; int idx; idx = srcu_read_lock(&nodes_srcu); node = nodeid2node(nodeid); if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } spin_lock_bh(&node->state_lock); if (!node->users) { pr_debug("receive add member from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); switch (node->state) { case DLM_ESTABLISHED: break; case DLM_CLOSED: node->state = DLM_ESTABLISHED; pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; default: /* some invalid state passive shutdown * was failed, we try to reset and * hope it will go on. 
*/ log_print("reset node %d because shutdown stuck", node->nodeid); midcomms_node_reset(node); node->state = DLM_ESTABLISHED; break; } } node->users++; pr_debug("node %d users inc count %d\n", nodeid, node->users); spin_unlock_bh(&node->state_lock); srcu_read_unlock(&nodes_srcu, idx); } void dlm_midcomms_remove_member(int nodeid) { struct midcomms_node *node; int idx; idx = srcu_read_lock(&nodes_srcu); node = nodeid2node(nodeid); /* in case of dlm_midcomms_close() removes node */ if (!node) { srcu_read_unlock(&nodes_srcu, idx); return; } spin_lock_bh(&node->state_lock); /* case of dlm_midcomms_addr() created node but * was not added before because dlm_midcomms_close() * removed the node */ if (!node->users) { spin_unlock_bh(&node->state_lock); srcu_read_unlock(&nodes_srcu, idx); return; } node->users--; pr_debug("node %d users dec count %d\n", nodeid, node->users); /* hitting users count to zero means the * other side is running dlm_midcomms_stop() * we meet us to have a clean disconnect. */ if (node->users == 0) { pr_debug("receive remove member from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); switch (node->state) { case DLM_ESTABLISHED: break; case DLM_CLOSE_WAIT: /* passive shutdown DLM_LAST_ACK case 2 */ node->state = DLM_LAST_ACK; pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); dlm_send_fin(node, dlm_pas_fin_ack_rcv); break; case DLM_LAST_ACK: /* probably receive fin caught it, do nothing */ break; case DLM_CLOSED: /* already gone, do nothing */ break; default: log_print("%s: unexpected state: %d", __func__, node->state); break; } } spin_unlock_bh(&node->state_lock); srcu_read_unlock(&nodes_srcu, idx); } void dlm_midcomms_version_wait(void) { struct midcomms_node *node; int i, idx, ret; idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { ret = wait_event_timeout(node->shutdown_wait, node->version != DLM_VERSION_NOT_SET || node->state == DLM_CLOSED || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), DLM_SHUTDOWN_TIMEOUT); if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) pr_debug("version wait timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); } } srcu_read_unlock(&nodes_srcu, idx); } static void midcomms_shutdown(struct midcomms_node *node) { int ret; /* old protocol, we don't wait for pending operations */ switch (node->version) { case DLM_VERSION_3_2: break; default: return; } spin_lock_bh(&node->state_lock); pr_debug("receive active shutdown for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); switch (node->state) { case DLM_ESTABLISHED: node->state = DLM_FIN_WAIT1; pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); dlm_send_fin(node, dlm_act_fin_ack_rcv); break; case DLM_CLOSED: /* we have what we want */ break; default: /* busy to enter DLM_FIN_WAIT1, wait until passive * done in shutdown_wait to enter DLM_CLOSED. 
*/ break; } spin_unlock_bh(&node->state_lock); if (DLM_DEBUG_FENCE_TERMINATION) msleep(5000); /* wait for other side dlm + fin */ ret = wait_event_timeout(node->shutdown_wait, node->state == DLM_CLOSED || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), DLM_SHUTDOWN_TIMEOUT); if (!ret) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); else pr_debug("active shutdown done for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); } void dlm_midcomms_shutdown(void) { struct midcomms_node *node; int i, idx; mutex_lock(&close_lock); idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { midcomms_shutdown(node); } } dlm_lowcomms_shutdown(); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { midcomms_node_reset(node); } } srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); } int dlm_midcomms_close(int nodeid) { struct midcomms_node *node; int idx, ret; idx = srcu_read_lock(&nodes_srcu); /* Abort pending close/remove operation */ node = nodeid2node(nodeid); if (node) { /* let shutdown waiters leave */ set_bit(DLM_NODE_FLAG_CLOSE, &node->flags); wake_up(&node->shutdown_wait); } srcu_read_unlock(&nodes_srcu, idx); synchronize_srcu(&nodes_srcu); mutex_lock(&close_lock); idx = srcu_read_lock(&nodes_srcu); node = nodeid2node(nodeid); if (!node) { srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); return dlm_lowcomms_close(nodeid); } ret = dlm_lowcomms_close(nodeid); dlm_delete_debug_comms_file(node->debugfs); spin_lock_bh(&nodes_lock); hlist_del_rcu(&node->hlist); spin_unlock_bh(&nodes_lock); srcu_read_unlock(&nodes_srcu, idx); /* wait that all readers left until flush send queue */ synchronize_srcu(&nodes_srcu); /* drop all pending dlm messages, this is fine as * this function get called when the node is fenced */ dlm_send_queue_flush(node); call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); mutex_unlock(&close_lock); return ret; } /* debug functionality to send raw dlm msg from user space */ struct dlm_rawmsg_data { struct midcomms_node *node; void *buf; }; static void midcomms_new_rawmsg_cb(void *data) { struct dlm_rawmsg_data *rd = data; struct dlm_header *h = rd->buf; switch (h->h_version) { case cpu_to_le32(DLM_VERSION_3_1): break; default: switch (h->h_cmd) { case DLM_OPTS: if (!h->u.h_seq) h->u.h_seq = cpu_to_le32(atomic_fetch_inc(&rd->node->seq_send)); break; default: break; } break; } } int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, int buflen) { struct dlm_rawmsg_data rd; struct dlm_msg *msg; char *msgbuf; rd.node = node; rd.buf = buf; msg = dlm_lowcomms_new_msg(node->nodeid, buflen, &msgbuf, midcomms_new_rawmsg_cb, &rd); if (!msg) return -ENOMEM; memcpy(msgbuf, buf, buflen); dlm_lowcomms_commit_msg(msg); return 0; }
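/* Editorial illustration: the mhandle comments above describe a strict
 * caller contract -- every successful dlm_midcomms_get_mhandle() must be
 * paired with dlm_midcomms_commit_mhandle(), since the nodes_srcu read
 * lock taken in get is only released in commit. A hedged sketch of a
 * conforming caller, following the same shape as dlm_send_fin() above;
 * my_send_header_only() is a hypothetical name:
 */
static int my_send_header_only(int nodeid)
{
	int mb_len = sizeof(struct dlm_header);
	struct dlm_header *m_header;
	struct dlm_mhandle *mh;
	char *ppc;

	mh = dlm_midcomms_get_mhandle(nodeid, mb_len, &ppc);
	if (!mh)
		return -ENOMEM;

	/* fill the reserved buffer before committing it */
	m_header = (struct dlm_header *)ppc;
	m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
	m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid());
	m_header->h_length = cpu_to_le16(mb_len);
	m_header->h_cmd = DLM_FIN;	/* any valid command; FIN as in dlm_send_fin() */

	/* releases the srcu read lock taken in dlm_midcomms_get_mhandle() */
	dlm_midcomms_commit_mhandle(mh, NULL, 0);
	return 0;
}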
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/f2fs/acl.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/ * * Portions of this code from linux/fs/ext2/acl.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ #include <linux/f2fs_fs.h> #include "f2fs.h" #include "xattr.h" #include "acl.h" static inline size_t f2fs_acl_size(int count) { if (count <= 4) { return sizeof(struct f2fs_acl_header) + count * sizeof(struct f2fs_acl_entry_short); } else { return sizeof(struct f2fs_acl_header) + 4 * sizeof(struct f2fs_acl_entry_short) + (count - 4) * sizeof(struct f2fs_acl_entry); } } static inline int f2fs_acl_count(size_t size) { ssize_t s; size -= sizeof(struct f2fs_acl_header); s = size - 4 * sizeof(struct f2fs_acl_entry_short); if (s < 0) { if (size % sizeof(struct f2fs_acl_entry_short)) return -1; return size / sizeof(struct f2fs_acl_entry_short); } else { if (s % sizeof(struct f2fs_acl_entry)) return -1; return s / sizeof(struct f2fs_acl_entry) + 4; } } static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) { int i, count; struct posix_acl *acl; struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value; struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1); const char *end = value + size; if (size < sizeof(struct f2fs_acl_header)) return ERR_PTR(-EINVAL); if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION)) return ERR_PTR(-EINVAL); count = f2fs_acl_count(size); if (count < 0) return ERR_PTR(-EINVAL); if (count == 0) return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); for (i = 0; i < count; i++) { if ((char *)entry > end) goto fail; acl->a_entries[i].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm); switch (acl->a_entries[i].e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: entry = (struct f2fs_acl_entry *)((char *)entry + sizeof(struct f2fs_acl_entry_short)); break; case ACL_USER: acl->a_entries[i].e_uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); entry = (struct f2fs_acl_entry *)((char *)entry + sizeof(struct f2fs_acl_entry)); break; case ACL_GROUP: acl->a_entries[i].e_gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); entry = (struct f2fs_acl_entry *)((char *)entry + sizeof(struct f2fs_acl_entry)); break; default: goto fail; } } if ((char *)entry != end) goto fail; return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, const struct posix_acl *acl, size_t *size) { struct f2fs_acl_header *f2fs_acl; struct f2fs_acl_entry *entry; int i; f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + acl->a_count * sizeof(struct f2fs_acl_entry), GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION); entry = (struct f2fs_acl_entry *)(f2fs_acl + 1); for (i = 0; i < acl->a_count; i++) { entry->e_tag = cpu_to_le16(acl->a_entries[i].e_tag); entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm); switch (acl->a_entries[i].e_tag) { case ACL_USER: entry->e_id = cpu_to_le32( from_kuid(&init_user_ns, acl->a_entries[i].e_uid)); entry = (struct f2fs_acl_entry *)((char *)entry + sizeof(struct f2fs_acl_entry)); break; case ACL_GROUP: entry->e_id = cpu_to_le32( from_kgid(&init_user_ns, acl->a_entries[i].e_gid)); entry = (struct f2fs_acl_entry *)((char *)entry + sizeof(struct f2fs_acl_entry)); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: entry = (struct f2fs_acl_entry *)((char *)entry + sizeof(struct f2fs_acl_entry_short)); break; default: goto fail; } } *size = 
f2fs_acl_size(acl->a_count); return (void *)f2fs_acl; fail: kfree(f2fs_acl); return ERR_PTR(-EINVAL); } static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, struct page *dpage) { int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; void *value = NULL; struct posix_acl *acl; int retval; if (type == ACL_TYPE_ACCESS) name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, retval, dpage); } if (retval > 0) acl = f2fs_acl_from_disk(value, retval); else if (retval == -ENODATA) acl = NULL; else acl = ERR_PTR(retval); kfree(value); return acl; } struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) { if (rcu) return ERR_PTR(-ECHILD); return __f2fs_get_acl(inode, type, NULL); } static int f2fs_acl_update_mode(struct mnt_idmap *idmap, struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { umode_t mode = inode->i_mode; int error; if (is_inode_flag_set(inode, FI_ACL_MODE)) mode = F2FS_I(inode)->i_acl_mode; error = posix_acl_equiv_mode(*acl, &mode); if (error < 0) return error; if (error == 0) *acl = NULL; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; *mode_p = mode; return 0; } static int __f2fs_set_acl(struct mnt_idmap *idmap, struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { int name_index; void *value = NULL; size_t size = 0; int error; umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { error = f2fs_acl_update_mode(idmap, inode, &mode, &acl); if (error) return error; set_acl_inode(inode, mode); } break; case ACL_TYPE_DEFAULT: name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? 
-EACCES : 0; break; default: return -EINVAL; } if (acl) { value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); return PTR_ERR(value); } } error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); kfree(value); if (!error) set_cached_acl(inode, type, acl); clear_inode_flag(inode, FI_ACL_MODE); return error; } int f2fs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; return __f2fs_set_acl(idmap, inode, type, acl, NULL); } /* * Most part of f2fs_acl_clone, f2fs_acl_create_masq, f2fs_acl_create * are copied from posix_acl.c */ static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; if (acl) { clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), flags); if (clone) refcount_set(&clone->a_refcount, 1); } return clone; } static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) { struct posix_acl_entry *pa, *pe; struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; umode_t mode = *mode_p; int not_equiv = 0; /* assert(atomic_read(acl->a_refcount) == 1); */ FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: pa->e_perm &= (mode >> 6) | ~S_IRWXO; mode &= (pa->e_perm << 6) | ~S_IRWXU; break; case ACL_USER: case ACL_GROUP: not_equiv = 1; break; case ACL_GROUP_OBJ: group_obj = pa; break; case ACL_OTHER: pa->e_perm &= mode | ~S_IRWXO; mode &= pa->e_perm | ~S_IRWXO; break; case ACL_MASK: mask_obj = pa; not_equiv = 1; break; default: return -EIO; } } if (mask_obj) { mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; } else { if (!group_obj) return -EIO; group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; mode &= (group_obj->e_perm << 3) | ~S_IRWXG; } *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } static int f2fs_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl, struct page *dpage) { struct posix_acl *p; struct posix_acl *clone; int ret; *acl = NULL; *default_acl = NULL; if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) return 0; p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage); if (!p || p == ERR_PTR(-EOPNOTSUPP)) { *mode &= ~current_umask(); return 0; } if (IS_ERR(p)) return PTR_ERR(p); clone = f2fs_acl_clone(p, GFP_NOFS); if (!clone) { ret = -ENOMEM; goto release_acl; } ret = f2fs_acl_create_masq(clone, mode); if (ret < 0) goto release_clone; if (ret == 0) posix_acl_release(clone); else *acl = clone; if (!S_ISDIR(*mode)) posix_acl_release(p); else *default_acl = p; return 0; release_clone: posix_acl_release(clone); release_acl: posix_acl_release(p); return ret; } int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, struct page *dpage) { struct posix_acl *default_acl = NULL, *acl = NULL; int error; error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage); if (error) return error; f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl, ipage); posix_acl_release(default_acl); } else { inode->i_default_acl = NULL; } if (acl) { if (!error) error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl, ipage); posix_acl_release(acl); } else { inode->i_acl = NULL; } return error; }
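/* Editorial illustration: f2fs_acl_size() and f2fs_acl_count() above are
 * inverses built on the on-disk entry sizes. A hedged, user-space check of
 * that arithmetic -- HDR_SZ/SHORT_SZ/FULL_SZ mirror sizeof(struct
 * f2fs_acl_header), sizeof(struct f2fs_acl_entry_short) and sizeof(struct
 * f2fs_acl_entry) from acl.h, assuming the usual 4/4/8-byte layout:
 */
#include <assert.h>
#include <stddef.h>

#define HDR_SZ   4	/* __le32 a_version */
#define SHORT_SZ 4	/* __le16 e_tag + __le16 e_perm */
#define FULL_SZ  8	/* short entry plus __le32 e_id */

static size_t model_acl_size(int count)
{
	/* up to four entries are the short tag-only kind; every entry
	 * beyond four carries an e_id and uses the full size
	 */
	if (count <= 4)
		return HDR_SZ + count * SHORT_SZ;
	return HDR_SZ + 4 * SHORT_SZ + (count - 4) * FULL_SZ;
}

int main(void)
{
	assert(model_acl_size(3) == 4 + 3 * 4);		/* 16 bytes */
	assert(model_acl_size(6) == 4 + 4 * 4 + 2 * 8);	/* 36 bytes */
	return 0;
}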
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: common low-level thread information accessors
 *
 * Copyright (C) 2002  David Howells (dhowells@redhat.com)
 * - Incorporating suggestions made by Linus Torvalds
 */

#ifndef _LINUX_THREAD_INFO_H
#define _LINUX_THREAD_INFO_H

#include <linux/types.h>
#include <linux/limits.h>
#include <linux/bug.h>
#include <linux/restart_block.h>
#include <linux/errno.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 * including <asm/current.h> can cause a circular dependency on some platforms.
 */
#include <asm/current.h>
#define current_thread_info() ((struct thread_info *)current)
#endif

#include <linux/bitops.h>

/*
 * For per-arch arch_within_stack_frames() implementations, defined in
 * asm/thread_info.h.
*/ enum { BAD_STACK = -1, NOT_STACK = 0, GOOD_FRAME, GOOD_STACK, }; #ifdef CONFIG_GENERIC_ENTRY enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, SYSCALL_WORK_BIT_SYSCALL_AUDIT, SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) #define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) #endif #include <asm/thread_info.h> #ifndef TIF_NEED_RESCHED_LAZY #ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY #error Inconsistent PREEMPT_LAZY #endif #define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED #endif #ifdef __KERNEL__ #ifndef arch_set_restart_data #define arch_set_restart_data(restart) do { } while (0) #endif static inline long set_restart_fn(struct restart_block *restart, long (*fn)(struct restart_block *)) { restart->fn = fn; arch_set_restart_data(restart); return -ERESTART_RESTARTBLOCK; } #ifndef THREAD_ALIGN #define THREAD_ALIGN THREAD_SIZE #endif #define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions */ static inline void set_ti_thread_flag(struct thread_info *ti, int flag) { set_bit(flag, (unsigned long *)&ti->flags); } static inline void clear_ti_thread_flag(struct thread_info *ti, int flag) { clear_bit(flag, (unsigned long *)&ti->flags); } static inline void update_ti_thread_flag(struct thread_info *ti, int flag, bool value) { if (value) set_ti_thread_flag(ti, flag); else clear_ti_thread_flag(ti, flag); } static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_set_bit(flag, (unsigned long *)&ti->flags); } static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_clear_bit(flag, (unsigned long *)&ti->flags); } static inline int test_ti_thread_flag(struct thread_info *ti, int flag) { return test_bit(flag, (unsigned long *)&ti->flags); } /* * This may be used in noinstr code, and needs to be __always_inline to prevent * inadvertent instrumentation. 
*/ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti) { return READ_ONCE(ti->flags); } #define set_thread_flag(flag) \ set_ti_thread_flag(current_thread_info(), flag) #define clear_thread_flag(flag) \ clear_ti_thread_flag(current_thread_info(), flag) #define update_thread_flag(flag, value) \ update_ti_thread_flag(current_thread_info(), flag, value) #define test_and_set_thread_flag(flag) \ test_and_set_ti_thread_flag(current_thread_info(), flag) #define test_and_clear_thread_flag(flag) \ test_and_clear_ti_thread_flag(current_thread_info(), flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) #define read_thread_flags() \ read_ti_thread_flags(current_thread_info()) #define read_task_thread_flags(t) \ read_ti_thread_flags(task_thread_info(t)) #ifdef CONFIG_GENERIC_ENTRY #define set_syscall_work(fl) \ set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) #define test_syscall_work(fl) \ test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) #define clear_syscall_work(fl) \ clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) #define set_task_syscall_work(t, fl) \ set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #define test_task_syscall_work(t, fl) \ test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #define clear_task_syscall_work(t, fl) \ clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #else /* CONFIG_GENERIC_ENTRY */ #define set_syscall_work(fl) \ set_ti_thread_flag(current_thread_info(), TIF_##fl) #define test_syscall_work(fl) \ test_ti_thread_flag(current_thread_info(), TIF_##fl) #define clear_syscall_work(fl) \ clear_ti_thread_flag(current_thread_info(), TIF_##fl) #define set_task_syscall_work(t, fl) \ set_ti_thread_flag(task_thread_info(t), TIF_##fl) #define test_task_syscall_work(t, fl) \ test_ti_thread_flag(task_thread_info(t), TIF_##fl) #define clear_task_syscall_work(t, fl) \ clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H static __always_inline bool tif_test_bit(int bit) { return arch_test_bit(bit, (unsigned long *)(&current_thread_info()->flags)); } #else static __always_inline bool tif_test_bit(int bit) { return test_bit(bit, (unsigned long *)(&current_thread_info()->flags)); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ static __always_inline bool tif_need_resched(void) { return tif_test_bit(TIF_NEED_RESCHED); } #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, const void *obj, unsigned long len) { return 0; } #endif #ifdef CONFIG_HARDENED_USERCOPY extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); static __always_inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { } #endif /* CONFIG_HARDENED_USERCOPY */ extern void __compiletime_error("copy source size is too small") __bad_copy_from(void); extern void __compiletime_error("copy destination size is too small") __bad_copy_to(void); void __copy_overflow(int size, unsigned long count); static inline void copy_overflow(int size, unsigned long count) { if (IS_ENABLED(CONFIG_BUG)) __copy_overflow(size, count); } static __always_inline __must_check bool 
check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __builtin_object_size(addr, 0); if (unlikely(sz >= 0 && sz < bytes)) { if (!__builtin_constant_p(bytes)) copy_overflow(sz, bytes); else if (is_source) __bad_copy_from(); else __bad_copy_to(); return false; } if (WARN_ON_ONCE(bytes > INT_MAX)) return false; check_object_size(addr, bytes, is_source); return true; } #ifndef arch_setup_new_exec static inline void arch_setup_new_exec(void) { } #endif void arch_task_cache_init(void); /* for CONFIG_SH */ void arch_release_task_struct(struct task_struct *tsk); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */
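/* Editorial illustration: check_copy_size() above is the hook that the
 * usercopy hardening checks and the __bad_copy_* compile-time errors hang
 * off. A hedged sketch of how a copy helper typically wraps it, modelled
 * on the copy_to_user() pattern in <linux/uaccess.h>; my_copy_to_user() is
 * a hypothetical name:
 */
static __always_inline unsigned long
my_copy_to_user(void __user *to, const void *from, unsigned long n)
{
	if (!check_copy_size(from, n, true))
		return n;	/* failed check: report nothing copied */
	return raw_copy_to_user(to, from, n);
}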
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/indirect.c
 *
 *  from
 *
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie *
(sct@redhat.com), 1993, 1998 */ #include "ext4_jbd2.h" #include "truncate.h" #include <linux/dax.h> #include <linux/uio.h> #include <trace/events/ext4.h> typedef struct { __le32 *p; __le32 key; struct buffer_head *bh; } Indirect; static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) { p->key = *(p->p = v); p->bh = bh; } /** * ext4_block_to_path - parse the block number into array of offsets * @inode: inode in question (we are only interested in its superblock) * @i_block: block number to be parsed * @offsets: array to store the offsets in * @boundary: set this non-zero if the referred-to block is likely to be * followed (on disk) by an indirect block. * * To store the locations of file's data ext4 uses a data structure common * for UNIX filesystems - tree of pointers anchored in the inode, with * data blocks at leaves and indirect blocks in intermediate nodes. * This function translates the block number into path in that tree - * return value is the path length and @offsets[n] is the offset of * pointer to (n+1)th node in the nth one. If @block is out of range * (negative or too large) warning is printed and zero returned. * * Note: function doesn't find node addresses, so no IO is needed. All * we need to know is the capacity of indirect blocks (taken from the * inode->i_sb). */ /* * Portability note: the last comparison (check that we fit into triple * indirect block) is spelled differently, because otherwise on an * architecture with 32-bit longs and 8Kb pages we might get into trouble * if our filesystem had 8Kb blocks. We might use long long, but that would * kill us on x86. Oh, well, at least the sign propagation does not matter - * i_block would have to be negative in the very beginning, so we would not * get there at all. */ static int ext4_block_to_path(struct inode *inode, ext4_lblk_t i_block, ext4_lblk_t offsets[4], int *boundary) { int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); const long direct_blocks = EXT4_NDIR_BLOCKS, indirect_blocks = ptrs, double_blocks = (1 << (ptrs_bits * 2)); int n = 0; int final = 0; if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; } else if ((i_block -= direct_blocks) < indirect_blocks) { offsets[n++] = EXT4_IND_BLOCK; offsets[n++] = i_block; final = ptrs; } else if ((i_block -= indirect_blocks) < double_blocks) { offsets[n++] = EXT4_DIND_BLOCK; offsets[n++] = i_block >> ptrs_bits; offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { offsets[n++] = EXT4_TIND_BLOCK; offsets[n++] = i_block >> (ptrs_bits * 2); offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { ext4_warning(inode->i_sb, "block %lu > max in inode %lu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); return n; } /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question * @depth: depth of the chain (1 - direct pointer, etc.) * @offsets: offsets of pointers in inode/indirect blocks * @chain: place to store the result * @err: here we store the error value * * Function fills the array of triples <key, p, bh> and returns %NULL * if everything went OK or the pointer to the last filled triple * (incomplete one) otherwise. Upon the return chain[i].key contains * the number of (i+1)-th block in the chain (as it is stored in memory, * i.e. 
little-endian 32-bit), chain[i].p contains the address of that * number (it points into struct inode for i==0 and into the bh->b_data * for i>0) and chain[i].bh points to the buffer_head of i-th indirect * block for i>0 and NULL for i==0. In other words, it holds the block * numbers of the chain, addresses they were taken from (and where we can * verify that chain did not change) and buffer_heads hosting these * numbers. * * Function stops when it stumbles upon zero pointer (absent block) * (pointer to last triple returned, *@err == 0) * or when it gets an IO error reading an indirect block * (ditto, *@err == -EIO) * or when it reads all @depth-1 indirect blocks successfully and finds * the whole chain, all the way to the data (returns %NULL, *err == 0). * * Needs to be called with * down_read(&EXT4_I(inode)->i_data_sem) */ static Indirect *ext4_get_branch(struct inode *inode, int depth, ext4_lblk_t *offsets, Indirect chain[4], int *err) { struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; unsigned int key; int ret = -EIO; *err = 0; /* i_data is not going away, no lock needed */ add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); if (!p->key) goto no_block; while (--depth) { key = le32_to_cpu(p->key); if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { /* the block was out of range */ ret = -EFSCORRUPTED; goto failure; } bh = sb_getblk(sb, key); if (unlikely(!bh)) { ret = -ENOMEM; goto failure; } if (!bh_uptodate_or_lock(bh)) { if (ext4_read_bh(bh, 0, NULL, false) < 0) { put_bh(bh); goto failure; } /* validate block references */ if (ext4_check_indirect_blockref(inode, bh)) { put_bh(bh); goto failure; } } add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) goto no_block; } return NULL; failure: *err = ret; no_block: return p; } /** * ext4_find_near - find a place for allocation with sufficient locality * @inode: owner * @ind: descriptor of indirect block. * * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. * + if pointer will live in indirect block - allocate near that block. * + if pointer will live in inode - allocate in the same * cylinder group. * * In the latter case we colour the starting block by the caller's PID to * prevent it from clashing with concurrent allocations for a different inode * in the same block group. The PID is used here so that functionally related * files will be close-by on-disk. * * Caller must make sure that @ind is valid and will stay that way. */ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { if (*p) return le32_to_cpu(*p); } /* No such thing, so let's try location of indirect block */ if (ind->bh) return ind->bh->b_blocknr; /* * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ return ext4_inode_to_goal_block(inode); } /** * ext4_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * * Normally this function finds the preferred place for block allocation * and returns it. * Because this is only used for non-extent files, we limit the block nr * to 32 bits.
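* * Editorial example (illustrative, not part of the original source): if the last non-zero entry to the left of @partial's slot holds block #8200, ext4_find_near() returns 8200 and the new block is allocated as close to it as the allocator manages; the masking with EXT4_MAX_BLOCK_FILE_PHYS below merely clamps the goal to the 32-bit physical block range that indirect-mapped files can address.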
 */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { ext4_fsblk_t goal; /* * XXX need to get goal block from mballoc's data structures */ goal = ext4_find_near(inode, partial); goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; return goal; } /** * ext4_blks_to_allocate - Look up the block map and count the number * of direct blocks that need to be allocated for the given branch. * * @branch: chain of indirect blocks * @k: number of blocks needed for indirect blocks * @blks: number of data blocks to be mapped. * @blocks_to_boundary: the offset in the indirect block * * return the total number of blocks to be allocated, including the * direct and indirect blocks. */ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, int blocks_to_boundary) { unsigned int count = 0; /* * Simple case: the [t,d]indirect block(s) have not been allocated yet, * so it is clear that the blocks on that path have not been allocated * either */ if (k > 0) { /* right now we don't handle cross boundary allocation */ if (blks < blocks_to_boundary + 1) count += blks; else count += blocks_to_boundary + 1; return count; } count++; while (count < blks && count <= blocks_to_boundary && le32_to_cpu(*(branch[0].p + count)) == 0) { count++; } return count; } /** * ext4_alloc_branch() - allocate and set up a chain of blocks * @handle: handle for this transaction * @ar: structure describing the allocation request * @indirect_blks: number of allocated indirect blocks * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * * This function allocates blocks, zeroes out all but the last one, * links them into chain and (if we are synchronous) writes them to disk. * In other words, it prepares a branch that can be spliced onto the * inode. It stores the information about that chain in the branch[], in * the same format as ext4_get_branch() would do. We are calling it after * we had read the existing part of chain and partial points to the last * triple of that (one with zero ->key). Upon the exit we have the same * picture as after the successful ext4_get_block(), except that in one * place chain is disconnected - *branch->p is still zero (we did not * set the last link), but branch->key contains the number that should * be placed into *branch->p to fill that gap. * * If allocation fails we free all blocks we've allocated (and forget * their buffer_heads) and return the error value from the failed * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct ext4_allocation_request *ar, int indirect_blks, ext4_lblk_t *offsets, Indirect *branch) { struct buffer_head * bh; ext4_fsblk_t b, new_blocks[4]; __le32 *p; int i, j, err, len = 1; for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); } else { ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, ar->inode, ar->goal, ar->flags & EXT4_MB_DELALLOC_RESERVED, NULL, &err); /* Simplify error cleanup...
 */ branch[i+1].bh = NULL; } if (err) { i--; goto failed; } branch[i].key = cpu_to_le32(new_blocks[i]); if (i == 0) continue; bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; } lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, ar->inode->i_sb, bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto failed; } memset(bh->b_data, 0, bh->b_size); p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; b = new_blocks[i]; if (i == indirect_blks) len = ar->len; for (j = 0; j < len; j++) *p++ = cpu_to_le32(b++); BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, bh); if (err) goto failed; } return 0; failed: if (i == indirect_blks) { /* Free data blocks */ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], ar->len, 0); i--; } for (; i >= 0; i--) { /* * We want to ext4_forget() only freshly allocated indirect * blocks. Buffer for new_blocks[i] is at branch[i+1].bh * (buffer at branch[0].bh is indirect block / inode already * existing before ext4_alloc_branch() was called). Also * because blocks are freshly allocated, we don't need to * revoke them which is why we don't set * EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, ar->inode, branch[i+1].bh, new_blocks[i], 1, branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0); } return err; } /** * ext4_splice_branch() - splice the allocated branch onto inode. * @handle: handle for this transaction * @ar: structure describing the allocation request * @where: location of missing link * @num: number of indirect blocks we are adding * * This function fills the missing link and does all housekeeping needed in * inode (->i_blocks, etc.). In case of success we end up with the full * chain to new block and return 0. */ static int ext4_splice_branch(handle_t *handle, struct ext4_allocation_request *ar, Indirect *where, int num) { int i; int err = 0; ext4_fsblk_t current_block; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block * before the splice. */ if (where->bh) { BUFFER_TRACE(where->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, ar->inode->i_sb, where->bh, EXT4_JTR_NONE); if (err) goto err_out; } /* That's it */ *where->p = where->key; /* * Update the host buffer_head or inode to point to the just allocated * direct blocks */ if (num == 0 && ar->len > 1) { current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < ar->len; i++) *(where->p + i) = cpu_to_le32(current_block++); } /* We are done with atomic stuff, now do the rest of housekeeping */ /* had we spliced it onto indirect block? */ if (where->bh) { /* * If we spliced it onto an indirect block, we haven't * altered the inode. Note however that if it is being spliced * onto an indirect block at the very end of the file (the * file is growing) then we *will* alter the inode to reflect * the new i_size. But that is not done here - it is done in * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. */ ext4_debug("splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); if (err) goto err_out; } else { /* * OK, we spliced it into the inode itself on a direct block.
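* * Editorial note (not part of the original source): the single store '*where->p = where->key' above is the point at which the new branch becomes reachable from the inode; until it executes, the freshly allocated blocks are invisible to the rest of the filesystem, which is why a failure in ext4_alloc_branch() can simply free them without any recovery walk.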
 */ err = ext4_mark_inode_dirty(handle, ar->inode); if (unlikely(err)) goto err_out; ext4_debug("splicing direct\n"); } return err; err_out: for (i = 1; i <= num; i++) { /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), ar->len, 0); return err; } /* * The ext4_ind_map_blocks() function handles non-extent inodes * (i.e., using the traditional indirect/double-indirect i_blocks * scheme) for ext4_map_blocks(). * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything * to tree, set linkage between the newborn blocks, write them if sync is * required, recheck the path, free and repeat if check fails, otherwise * set the last missing link (that will protect us from any truncate-generated * removals - all blocks on the path are immune now) and possibly force the * write on the parent block. * That has a nice additional property: no special recovery from the failed * allocations is needed - we simply release blocks and do not touch anything * reachable from inode. * * `handle' can be NULL if create == 0. * * return > 0, # of blocks mapped or allocated. * return = 0, if plain lookup failed. * return < 0, error case. * * The ext4_ind_map_blocks() function should be called with * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system * blocks. */ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_allocation_request ar; int err = -EIO; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; int indirect_blks; int blocks_to_boundary = 0; int depth; int count = 0; ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, map->m_lblk, offsets, &blocks_to_boundary); if (depth == 0) goto out; partial = ext4_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); count++; /* map more blocks */ while (count < map->m_len && count <= blocks_to_boundary) { ext4_fsblk_t blk; blk = le32_to_cpu(*(chain[depth-1].p + count)); if (blk == first_block + count) count++; else break; } goto got_it; } /* Next simple case - plain lookup failed */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); int i; /* * Count the number of blocks in the subtree under 'partial'. At each * level we count the number of complete empty subtrees beyond * the current offset, then descend into the subtree that is only * partially beyond the current offset. */ count = 0; for (i = partial - chain + 1; i < depth; i++) count = count * epb + (epb - offsets[i] - 1); count++; /* Fill in size of a hole we found */ map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, count); goto cleanup; } /* Failed read of indirect block */ if (err == -EIO) goto cleanup; /* * Okay, we need to do block allocation.
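* * Editorial walk-through (illustrative, assuming 4KiB blocks): mapping logical block 5000 in a sparse file yields offsets {13, 3, 892}; if even the double-indirect block is missing, partial == chain, indirect_blks below evaluates to 2, and ext4_alloc_branch() must allocate the dindirect and indirect blocks in addition to the data block(s).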
 */ if (ext4_has_feature_bigalloc(inode->i_sb)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); err = -EFSCORRUPTED; goto out; } /* Set up for the direct block allocation */ memset(&ar, 0, sizeof(ar)); ar.inode = inode; ar.logical = map->m_lblk; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) ar.flags |= EXT4_MB_USE_RESERVED; ar.goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks we need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; /* * Next look up the indirect map to count the total number of * direct blocks to allocate for this branch. */ ar.len = ext4_blks_to_allocate(partial, indirect_blks, map->m_len, blocks_to_boundary); /* * Block out ext4_truncate while we alter the tree */ err = ext4_alloc_branch(handle, &ar, indirect_blks, offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers * on the new chain if there is a failure, but that risks using * up transaction credits, especially for bitmaps where the * credits cannot be returned. Can we handle this somehow? We * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) err = ext4_splice_branch(handle, &ar, partial, indirect_blks); if (err) goto cleanup; map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); count = ar.len; got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); map->m_len = count; if (count > blocks_to_boundary) map->m_flags |= EXT4_MAP_BOUNDARY; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ cleanup: while (partial > chain) { BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } out: trace_ext4_ind_map_blocks_exit(inode, flags, map, err); return err; } /* * Calculate number of indirect blocks touched by mapping @nrblocks logically * contiguous blocks */ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) { /* * With N contiguous data blocks, we need at most * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, * 2 dindirect blocks, and 1 tindirect block */ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, int *dropped) { int err; if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) return err; } err = ext4_mark_inode_dirty(handle, inode); if (unlikely(err)) return err; /* * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; } /* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a convenient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If * extend fails, we restart the transaction.
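* * Editorial note (not part of the original source): callers must not assume i_data_sem is held continuously across this helper; the restart path (ext4_ind_trunc_restart_fn() above) drops it, it is re-acquired below when 'dropped' is set, and when the handle was actually restarted the buffer_head's write access is re-taken as well.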
*/ static int ext4_ind_truncate_ensure_credits(handle_t *handle, struct inode *inode, struct buffer_head *bh, int revoke_creds) { int ret; int dropped = 0; ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_blocks_for_truncate(inode), revoke_creds, ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped)); if (dropped) down_write(&EXT4_I(inode)->i_data_sem); if (ret <= 0) return ret; if (bh) { BUFFER_TRACE(bh, "retaking write access"); ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(ret)) return ret; } return 0; } /* * Probably it should be a library function... search for first non-zero word * or memcmp with zero_page, whatever is better for particular architecture. * Linus? */ static inline int all_zeroes(__le32 *p, __le32 *q) { while (p < q) if (*p++) return 0; return 1; } /** * ext4_find_shared - find the indirect blocks for partial truncation. * @inode: inode in question * @depth: depth of the affected branch * @offsets: offsets of pointers in that branch (see ext4_block_to_path) * @chain: place to store the pointers to partial indirect blocks * @top: place to the (detached) top of branch * * This is a helper function used by ext4_truncate(). * * When we do truncate() we may have to clean the ends of several * indirect blocks but leave the blocks themselves alive. Block is * partially truncated if some data below the new i_size is referred * from it (and it is on the path to the first completely truncated * data block, indeed). We have to free the top of that path along * with everything to the right of the path. Since no allocation * past the truncation point is possible until ext4_truncate() * finishes, we may safely do the latter, but top of branch may * require special attention - pageout below the truncation point * might try to populate it. * * We atomically detach the top of branch from the tree, store the * block number of its root in *@top, pointers to buffer_heads of * partially truncated blocks - in @chain[].bh and pointers to * their last elements that should not be removed - in * @chain[].p. Return value is the pointer to last filled element * of @chain. * * The work left to caller to do the actual freeing of subtrees: * a) free the subtree starting from *@top * b) free the subtrees whose roots are stored in * (@chain[i].p+1 .. end of @chain[i].bh->b_data) * c) free the subtrees growing from the inode past the @chain[0]. * (no partially truncated stuff there). */ static Indirect *ext4_find_shared(struct inode *inode, int depth, ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) { Indirect *partial, *p; int k, err; *top = 0; /* Make k index the deepest non-null offset + 1 */ for (k = depth; k > 1 && !offsets[k-1]; k--) ; partial = ext4_get_branch(inode, k, offsets, chain, &err); /* Writer: pointers */ if (!partial) partial = chain + k-1; /* * If the branch acquired continuation since we've looked at it - * fine, it should all survive and (new) top doesn't belong to us. */ if (!partial->key && *partial->p) /* Writer: end */ goto no_top; for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) ; /* * OK, we've found the last block that must survive. The rest of our * branch should be detached before unlocking. However, if that rest * of branch is all ours and does not grow immediately from the inode * it's easier to cheat and just decrement partial->p. */ if (p == chain + k - 1 && p > chain) { p->p--; } else { *top = *p->p; /* Nope, don't do this in ext4. 
Must leave the tree intact */ #if 0 *p->p = 0; #endif } /* Writer: end */ while (partial > p) { brelse(partial->bh); partial--; } no_top: return partial; } /* * Zero a number of block pointers in either an inode or an indirect block. * If we restart the transaction we must again get write access to the * indirect block for further modification. * * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. * * Return 0 on success, 1 on invalid block range * and < 0 on fatal error. */ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block_to_free, unsigned long count, __le32 *first, __le32 *last) { __le32 *p; int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; if (!ext4_inode_block_valid(inode, block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); return 1; } err = ext4_ind_truncate_ensure_credits(handle, inode, bh, ext4_free_data_revoke_credits(inode, count)); if (err < 0) goto out_err; for (p = first; p < last; p++) *p = 0; ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); return 0; out_err: ext4_std_error(inode->i_sb, err); return err; } /** * ext4_free_data - free a list of data blocks * @handle: handle for this transaction * @inode: inode we are dealing with * @this_bh: indirect buffer_head which contains *@first and *@last * @first: array of block numbers * @last: points immediately past the end of array * * We are freeing all blocks referred from that array (numbers are stored as * little-endian 32-bit) and updating @inode->i_blocks appropriately. * * We accumulate contiguous runs of blocks to free. Conveniently, if these * blocks are contiguous then releasing them at one time will only affect one * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't * actually use a lot of journal space. * * @this_bh will be %NULL if @first and @last point into the inode's direct * block pointers. */ static void ext4_free_data(handle_t *handle, struct inode *inode, struct buffer_head *this_bh, __le32 *first, __le32 *last) { ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ unsigned long count = 0; /* Number of blocks in the run */ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind corresponding to block_to_free */ ext4_fsblk_t nr; /* Current block # */ __le32 *p; /* Pointer into inode/ind for current block */ int err = 0; if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, this_bh, EXT4_JTR_NONE); /* Important: if we can't update the indirect pointers * to the blocks, we can't free them. 
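* * Editorial example (illustrative, not part of the original source): an indirect block whose slots hold {100, 101, 102, 0, 200} is released by the loop below as two runs - [100..102] and [200] - each handed to ext4_clear_blocks() in a single call, with the zero slot simply skipped.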
*/ if (err) return; } for (p = first; p < last; p++) { nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ if (count == 0) { block_to_free = nr; block_to_free_p = p; count = 1; } else if (nr == block_to_free + count) { count++; } else { err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err) break; block_to_free = nr; block_to_free_p = p; count = 1; } } } if (!err && count > 0) err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err < 0) /* fatal error */ return; if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); /* * The buffer head should have an attached journal head at this * point. However, if the data is corrupted and an indirect * block pointed to itself, it would have been detached when * the block was cleared. Check for this instead of OOPSing. */ if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else EXT4_ERROR_INODE(inode, "circular indirect block detected at " "block %llu", (unsigned long long) this_bh->b_blocknr); } } /** * ext4_free_branches - free an array of branches * @handle: JBD handle for this transaction * @inode: inode we are dealing with * @parent_bh: the buffer_head which contains *@first and *@last * @first: array of block numbers * @last: pointer immediately past the end of array * @depth: depth of the branches to free * * We are freeing all blocks referred from these branches (numbers are * stored as little-endian 32-bit) and updating @inode->i_blocks * appropriately. */ static void ext4_free_branches(handle_t *handle, struct inode *inode, struct buffer_head *parent_bh, __le32 *first, __le32 *last, int depth) { ext4_fsblk_t nr; __le32 *p; if (ext4_handle_is_aborted(handle)) return; if (depth--) { struct buffer_head *bh; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); p = last; while (--p >= first) { nr = le32_to_cpu(*p); if (!nr) continue; /* A hole */ if (!ext4_inode_block_valid(inode, nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", (unsigned long) nr, depth); break; } /* Go read the buffer for the next level down */ bh = ext4_sb_bread(inode->i_sb, nr, 0); /* * A read failure? Report error and clear slot * (should be rare). */ if (IS_ERR(bh)) { ext4_error_inode_block(inode, nr, -PTR_ERR(bh), "Read failure"); continue; } /* This zaps the entire block. Bottom up. */ BUFFER_TRACE(bh, "free child branches"); ext4_free_branches(handle, inode, bh, (__le32 *) bh->b_data, (__le32 *) bh->b_data + addr_per_block, depth); brelse(bh); /* * Everything below this pointer has been * released. Now let this top-of-subtree go. * * We want the freeing of this indirect block to be * atomic in the journal with the updating of the * bitmap block which owns it. So make some room in * the journal. * * We zero the parent pointer *after* freeing its * pointee in the bitmaps, so if extend_transaction() * for some reason fails to put the bitmap changes and * the release into the same transaction, recovery * will merely complain about releasing a free block, * rather than leaking blocks. 
*/ if (ext4_handle_is_aborted(handle)) return; if (ext4_ind_truncate_ensure_credits(handle, inode, NULL, ext4_free_metadata_revoke_credits( inode->i_sb, 1)) < 0) return; /* * The forget flag here is critical because if * we are journaling (and not doing data * journaling), we have to make sure a revoke * record is written to prevent the journal * replay from overwriting the (former) * indirect block if it gets reallocated as a * data block. This must happen in the same * transaction where the data blocks are * actually freed. */ ext4_free_blocks(handle, inode, NULL, nr, 1, EXT4_FREE_BLOCKS_METADATA| EXT4_FREE_BLOCKS_FORGET); if (parent_bh) { /* * The block which we have just freed is * pointed to by an indirect block: journal it */ BUFFER_TRACE(parent_bh, "get_write_access"); if (!ext4_journal_get_write_access(handle, inode->i_sb, parent_bh, EXT4_JTR_NONE)) { *p = 0; BUFFER_TRACE(parent_bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, inode, parent_bh); } } } } else { /* We have reached the bottom of the tree. */ BUFFER_TRACE(parent_bh, "free data blocks"); ext4_free_data(handle, inode, parent_bh, first, last); } } void ext4_ind_truncate(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; unsigned blocksize = inode->i_sb->s_blocksize; last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) return; } ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the * on-disk inode. We do this via i_disksize, which is the value which * ext4 *really* writes onto the disk inode. */ ei->i_disksize = inode->i_size; if (last_block == max_block) { /* * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); goto do_indirects; } partial = ext4_find_shared(inode, n, offsets, chain, &nr); /* Kill the top of shared branch (not detached) */ if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; /* * We mark the inode dirty prior to restart, * and prior to stop. No need for it here. 
 */ } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* Clear the ends of indirect blocks on the shared branch */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32*)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } fallthrough; case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } fallthrough; case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } fallthrough; case EXT4_TIND_BLOCK: ; } } /** * ext4_ind_remove_space - remove space from the range * @handle: JBD handle for this transaction * @inode: inode we are dealing with * @start: First block to remove * @end: One block after the last block to remove (exclusive) * * Free the blocks in the defined range (end is exclusive endpoint of * range). This is used by ext4_punch_hole(). */ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4], offsets2[4]; Indirect chain[4], chain2[4]; Indirect *partial, *partial2; Indirect *p = NULL, *p2 = NULL; ext4_lblk_t max_block; __le32 nr = 0, nr2 = 0; int n = 0, n2 = 0; unsigned blocksize = inode->i_sb->s_blocksize; max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (end >= max_block) end = max_block; if ((start >= end) || (start > max_block)) return 0; n = ext4_block_to_path(inode, start, offsets, NULL); n2 = ext4_block_to_path(inode, end, offsets2, NULL); BUG_ON(n > n2); if ((n == 1) && (n == n2)) { /* We're punching only within direct block range */ ext4_free_data(handle, inode, NULL, i_data + offsets[0], i_data + offsets2[0]); return 0; } else if (n2 > n) { /* * Start and end are on different levels, so we're going to * free a partial block at the start and a partial block at the * end of the range. If there are some levels in between then * the do_indirects label will take care of those. */ if (n == 1) { /* * Start is at the direct block level, free * everything to the end of the level.
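* * (Editorial example, illustrative with 4KiB blocks: punching blocks 5..5000 gives n == 1 and n2 == 3; the direct slots 5..11 are freed here, the whole single-indirect subtree is killed later at the do_indirects label, and end_range trims the double-indirect subtree up to, but not including, block 5000.)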
*/ ext4_free_data(handle, inode, NULL, i_data + offsets[0], i_data + EXT4_NDIR_BLOCKS); goto end_range; } partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* * Clear the ends of indirect blocks on the shared branch * at the start of the range */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); partial--; } end_range: partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); if (nr2) { if (partial2 == chain2) { /* * Remember, end is exclusive so here we're at * the start of the next level we're not going * to free. Everything was covered by the start * of the range. */ goto do_indirects; } } else { /* * ext4_find_shared returns Indirect structure which * points to the last element which should not be * removed by truncate. But this is end of the range * in punch_hole so we need to point to the next element */ partial2->p++; } /* * Clear the ends of indirect blocks on the shared branch * at the end of the range */ while (partial2 > chain2) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); partial2--; } goto do_indirects; } /* Punch happened within the same level (n == n2) */ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); /* Free top, but only if partial2 isn't its subtree. */ if (nr) { int level = min(partial - chain, partial2 - chain2); int i; int subtree = 1; for (i = 0; i <= level; i++) { if (offsets[i] != offsets2[i]) { subtree = 0; break; } } if (!subtree) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } } if (!nr2) { /* * ext4_find_shared returns Indirect structure which * points to the last element which should not be * removed by truncate. But this is end of the range * in punch_hole so we need to point to the next element */ partial2->p++; } while (partial > chain || partial2 > chain2) { int depth = (chain+n-1) - partial; int depth2 = (chain2+n2-1) - partial2; if (partial > chain && partial2 > chain2 && partial->bh->b_blocknr == partial2->bh->b_blocknr) { /* * We've converged on the same block. Clear the range, * then we're done. */ ext4_free_branches(handle, inode, partial->bh, partial->p + 1, partial2->p, (chain+n-1) - partial); goto cleanup; } /* * The start and end partial branches may not be at the same * level even though the punch happened within one level. So, we * give them a chance to arrive at the same level, then walk * them in step with each other until we converge on the same * block. 
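* * (Editorial example, illustrative with 4KiB blocks: punching 2000..3000 puts both ends at depth 3 under the same double-indirect block but in different indirect blocks; the walkers below clear each side out to its indirect-block boundary, step up one level, meet in the shared dindirect block, and the slots between them are released in a single ext4_free_branches() call.)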
*/ if (partial > chain && depth <= depth2) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); partial--; } if (partial2 > chain2 && depth2 <= depth) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); partial2--; } } cleanup: while (p && p > chain) { BUFFER_TRACE(p->bh, "call brelse"); brelse(p->bh); p--; } while (p2 && p2 > chain2) { BUFFER_TRACE(p2->bh, "call brelse"); brelse(p2->bh); p2--; } return 0; do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: if (++n >= n2) break; nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } fallthrough; case EXT4_IND_BLOCK: if (++n >= n2) break; nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } fallthrough; case EXT4_DIND_BLOCK: if (++n >= n2) break; nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } fallthrough; case EXT4_TIND_BLOCK: ; } goto cleanup; }
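/*
 * Worked example for ext4_block_to_path() above (editorial addition,
 * illustrative; assumes a 4KiB block size, so EXT4_ADDR_PER_BLOCK == 1024,
 * ptrs_bits == 10, EXT4_NDIR_BLOCKS == 12, and the [IND, DIND, TIND] slots
 * are i_data[12..14]):
 *
 *	i_block		depth	offsets			level
 *	5		1	{ 5 }			direct
 *	500		2	{ 12, 488 }		indirect
 *	5000		3	{ 13, 3, 892 }		double indirect
 *	2000000		4	{ 14, 0, 928, 116 }	triple indirect
 */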
// SPDX-License-Identifier: GPL-2.0-only /* * This implements the various checks for CONFIG_HARDENED_USERCOPY*, * which are designed to protect kernel memory from needless exposure * and overwrite under many unintended conditions. This code is based * on PAX_USERCOPY, which is: * * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source * Security Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/highmem.h> #include <linux/kstrtox.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include <asm/sections.h> #include "slab.h" /* * Checks if a given pointer and length is contained by the current * stack frame (if possible). * * Returns: * NOT_STACK: not at all on the stack * GOOD_FRAME: fully within a valid stack frame * GOOD_STACK: within the current stack (when can't frame-check exactly) * BAD_STACK: error condition (invalid stack position or bad stack frame) */ static noinline int check_stack_object(const void *obj, unsigned long len) { const void * const stack = task_stack_page(current); const void * const stackend = stack + THREAD_SIZE; int ret; /* Object is not on the stack at all. */ if (obj + len <= stack || stackend <= obj) return NOT_STACK; /* * Reject: object partially overlaps the stack (passing the * check above means at least one end is within the stack, * so if this check fails, the other end is outside the stack). */ if (obj < stack || stackend < obj + len) return BAD_STACK; /* Check if object is safely within a valid frame. */ ret = arch_within_stack_frames(stack, stackend, obj, len); if (ret) return ret; /* Finally, check stack depth if possible. */ #ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER if (IS_ENABLED(CONFIG_STACK_GROWSUP)) { if ((void *)current_stack_pointer < obj + len) return BAD_STACK; } else { if (obj < (void *)current_stack_pointer) return BAD_STACK; } #endif return GOOD_STACK; } /* * If these functions are reached, then CONFIG_HARDENED_USERCOPY has found * an unexpected state during a copy_from_user() or copy_to_user() call. * There are several checks being performed on the buffer by the * __check_object_size() function. Normal stack buffer usage should never * trip the checks, and kernel text addressing will always trip the check.
* For cache objects, it is checking that only the whitelisted range of * bytes for a given cache is being accessed (via the cache's usersize and * useroffset fields). To adjust a cache whitelist, use the usercopy-aware * kmem_cache_create_usercopy() function to create the cache (and * carefully audit the whitelist range). */ void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len) { pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n", to_user ? "exposure" : "overwrite", to_user ? "from" : "to", name ? : "unknown?!", detail ? " '" : "", detail ? : "", detail ? "'" : "", offset, len); /* * For greater effect, it would be nice to do do_group_exit(), * but BUG() actually hooks all the lock-breaking and per-arch * Oops code, so that is used here instead. */ BUG(); } /* Returns true if any portion of [ptr,ptr+n) overlaps with [low,high). */ static bool overlaps(const unsigned long ptr, unsigned long n, unsigned long low, unsigned long high) { const unsigned long check_low = ptr; unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ if (check_low >= high || check_high <= low) return false; return true; } /* Is this address range in the kernel text area? */ static inline void check_kernel_text_object(const unsigned long ptr, unsigned long n, bool to_user) { unsigned long textlow = (unsigned long)_stext; unsigned long texthigh = (unsigned long)_etext; unsigned long textlow_linear, texthigh_linear; if (overlaps(ptr, n, textlow, texthigh)) usercopy_abort("kernel text", NULL, to_user, ptr - textlow, n); /* * Some architectures have virtual memory mappings with a secondary * mapping of the kernel text, i.e. there is more than one virtual * kernel address that points to the kernel image. It is usually * when there is a separate linear physical memory mapping, in that * __pa() is not just the reverse of __va(). This can be detected * and checked: */ textlow_linear = (unsigned long)lm_alias(textlow); /* No different mapping: we're done. */ if (textlow_linear == textlow) return; /* Check the secondary mapping... */ texthigh_linear = (unsigned long)lm_alias(texthigh); if (overlaps(ptr, n, textlow_linear, texthigh_linear)) usercopy_abort("linear kernel text", NULL, to_user, ptr - textlow_linear, n); } static inline void check_bogus_address(const unsigned long ptr, unsigned long n, bool to_user) { /* Reject if object wraps past end of memory. */ if (ptr + (n - 1) < ptr) usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n); /* Reject if NULL or ZERO-allocation. */ if (ZERO_OR_NULL_PTR(ptr)) usercopy_abort("null address", NULL, to_user, ptr, n); } static inline void check_heap_object(const void *ptr, unsigned long n, bool to_user) { unsigned long addr = (unsigned long)ptr; unsigned long offset; struct folio *folio; if (is_kmap_addr(ptr)) { offset = offset_in_page(ptr); if (n > PAGE_SIZE - offset) usercopy_abort("kmap", NULL, to_user, offset, n); return; } if (is_vmalloc_addr(ptr) && !pagefault_disabled()) { struct vmap_area *area = find_vmap_area(addr); if (!area) usercopy_abort("vmalloc", "no area", to_user, 0, n); if (n > area->va_end - addr) { offset = addr - area->va_start; usercopy_abort("vmalloc", NULL, to_user, offset, n); } return; } if (!virt_addr_valid(ptr)) return; folio = virt_to_folio(ptr); if (folio_test_slab(folio)) { /* Check slab allocator for flags and size.
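* * Editorial note (not part of the original source): for slab-backed folios, __check_heap_object() enforces the per-cache whitelist, so e.g. (illustrative) a cache created with kmem_cache_create_usercopy(name, size, align, flags, useroffset, usersize, ctor) only permits user copies that fall entirely inside [useroffset, useroffset + usersize) of each object.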
*/ __check_heap_object(ptr, n, folio_slab(folio), to_user); } else if (folio_test_large(folio)) { offset = ptr - folio_address(folio); if (n > folio_size(folio) - offset) usercopy_abort("page alloc", NULL, to_user, offset, n); } } static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); /* * Validates that the given object is: * - not bogus address * - fully contained by stack (or stack frame, when available) * - fully within SLAB object (or object whitelist area, when available) * - not in kernel text */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { if (static_branch_unlikely(&bypass_usercopy_checks)) return; /* Skip all tests if size is zero. */ if (!n) return; /* Check for invalid addresses. */ check_bogus_address((const unsigned long)ptr, n, to_user); /* Check for bad stack object. */ switch (check_stack_object(ptr, n)) { case NOT_STACK: /* Object is not touching the current process stack. */ break; case GOOD_FRAME: case GOOD_STACK: /* * Object is either in the correct frame (when it * is possible to check) or just generally on the * process stack (when frame checking not available). */ return; default: usercopy_abort("process stack", NULL, to_user, #ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER IS_ENABLED(CONFIG_STACK_GROWSUP) ? ptr - (void *)current_stack_pointer : (void *)current_stack_pointer - ptr, #else 0, #endif n); } /* Check for bad heap object. */ check_heap_object(ptr, n, to_user); /* Check for object in kernel to avoid text exposure. */ check_kernel_text_object((const unsigned long)ptr, n, to_user); } EXPORT_SYMBOL(__check_object_size); static bool enable_checks __initdata = true; static int __init parse_hardened_usercopy(char *str) { if (kstrtobool(str, &enable_checks)) pr_warn("Invalid option string for hardened_usercopy: '%s'\n", str); return 1; } __setup("hardened_usercopy=", parse_hardened_usercopy); static int __init set_hardened_usercopy(void) { if (enable_checks == false) static_branch_enable(&bypass_usercopy_checks); return 1; } late_initcall(set_hardened_usercopy);
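/*
 * Illustrative call path (editorial sketch, not part of mm/usercopy.c):
 * with CONFIG_HARDENED_USERCOPY=y, copy_from_user()/copy_to_user() with a
 * size that is not a compile-time constant funnel through
 * check_object_size() into __check_object_size() above. A hypothetical
 * caller:
 *
 *	static long example_copy(void __user *uptr, unsigned long len)
 *	{
 *		char buf[64];			// stack object
 *
 *		if (len > sizeof(buf))
 *			return -EINVAL;
 *		// len is not a builtin constant, so this ends up calling
 *		// __check_object_size(buf, len, false) before the copy
 *		if (copy_from_user(buf, uptr, len))
 *			return -EFAULT;
 *		return 0;
 *	}
 *
 * Booting with hardened_usercopy=off flips the bypass static branch above
 * and skips all of these checks.
 */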
// SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * This file contains device methods for creating, using and destroying * virtual HSR or PRP devices.
*/ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> #include "hsr_device.h" #include "hsr_slave.h" #include "hsr_framereg.h" #include "hsr_main.h" #include "hsr_forward.h" static bool is_admin_up(struct net_device *dev) { return dev && (dev->flags & IFF_UP); } static bool is_slave_up(struct net_device *dev) { return dev && is_admin_up(dev) && netif_oper_up(dev); } static void hsr_set_operstate(struct hsr_port *master, bool has_carrier) { struct net_device *dev = master->dev; if (!is_admin_up(dev)) { netdev_set_operstate(dev, IF_OPER_DOWN); return; } if (has_carrier) netdev_set_operstate(dev, IF_OPER_UP); else netdev_set_operstate(dev, IF_OPER_LOWERLAYERDOWN); } static bool hsr_check_carrier(struct hsr_port *master) { struct hsr_port *port; ASSERT_RTNL(); hsr_for_each_port(master->hsr, port) { if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) { netif_carrier_on(master->dev); return true; } } netif_carrier_off(master->dev); return false; } static void hsr_check_announce(struct net_device *hsr_dev) { struct hsr_priv *hsr; hsr = netdev_priv(hsr_dev); if (netif_running(hsr_dev) && netif_oper_up(hsr_dev)) { /* Enable announce timer and start sending supervisory frames */ if (!timer_pending(&hsr->announce_timer)) { hsr->announce_count = 0; mod_timer(&hsr->announce_timer, jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL)); } if (hsr->redbox && !timer_pending(&hsr->announce_proxy_timer)) mod_timer(&hsr->announce_proxy_timer, jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL) / 2); } else { /* Deactivate the announce timer */ timer_delete(&hsr->announce_timer); if (hsr->redbox) timer_delete(&hsr->announce_proxy_timer); } } void hsr_check_carrier_and_operstate(struct hsr_priv *hsr) { struct hsr_port *master; bool has_carrier; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); /* netif_stacked_transfer_operstate() cannot be used here since * it doesn't set IF_OPER_LOWERLAYERDOWN (?) 
*/ has_carrier = hsr_check_carrier(master); hsr_set_operstate(master, has_carrier); hsr_check_announce(master->dev); } int hsr_get_max_mtu(struct hsr_priv *hsr) { unsigned int mtu_max; struct hsr_port *port; mtu_max = ETH_DATA_LEN; hsr_for_each_port(hsr, port) if (port->type != HSR_PT_MASTER) mtu_max = min(port->dev->mtu, mtu_max); if (mtu_max < HSR_HLEN) return 0; return mtu_max - HSR_HLEN; } static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) { struct hsr_priv *hsr; hsr = netdev_priv(dev); if (new_mtu > hsr_get_max_mtu(hsr)) { netdev_info(dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", HSR_HLEN); return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int hsr_dev_open(struct net_device *dev) { struct hsr_priv *hsr; struct hsr_port *port; const char *designation = NULL; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: designation = "Slave A"; break; case HSR_PT_SLAVE_B: designation = "Slave B"; break; case HSR_PT_INTERLINK: designation = "Interlink"; break; default: designation = "Unknown"; } if (!is_slave_up(port->dev)) netdev_warn(dev, "%s (%s) is not up; please bring it up to get a fully working HSR network\n", designation, port->dev->name); } if (!designation) netdev_warn(dev, "No slave devices configured\n"); return 0; } static int hsr_dev_close(struct net_device *dev) { struct hsr_port *port; struct hsr_priv *hsr; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: dev_uc_unsync(port->dev, dev); dev_mc_unsync(port->dev, dev); break; default: break; } } return 0; } static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, netdev_features_t features) { netdev_features_t mask; struct hsr_port *port; mask = features; /* Mask out all features that, if supported by one device, should be * enabled for all devices (see NETIF_F_ONE_FOR_ALL). * * Anything that's off in mask will not be enabled - so only things * that were in features originally, and also is in NETIF_F_ONE_FOR_ALL, * may become enabled. 
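* * Editorial example (illustrative, not part of the original source): if NETIF_F_SG was in the requested features and at least one slave advertises it, netdev_increment_features() below re-enables it for the HSR master; a one-for-all feature that no slave provides stays cleared.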
*/ features &= ~NETIF_F_ONE_FOR_ALL; hsr_for_each_port(hsr, port) features = netdev_increment_features(features, port->dev->features, mask); return features; } static netdev_features_t hsr_fix_features(struct net_device *dev, netdev_features_t features) { struct hsr_priv *hsr = netdev_priv(dev); return hsr_features_recompute(hsr, features); } static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (master) { skb->dev = master->dev; skb_reset_mac_header(skb); skb_reset_mac_len(skb); spin_lock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, master); spin_unlock_bh(&hsr->seqnr_lock); } else { dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); } return NETDEV_TX_OK; } static const struct header_ops hsr_header_ops = { .create = eth_header, .parse = eth_header_parse, }; static struct sk_buff *hsr_init_skb(struct hsr_port *master, int extra) { struct hsr_priv *hsr = master->hsr; struct sk_buff *skb; int hlen, tlen; int len; hlen = LL_RESERVED_SPACE(master->dev); tlen = master->dev->needed_tailroom; len = sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload); /* skb size is same for PRP/HSR frames, only difference * being, for PRP it is a trailer and for HSR it is a * header. * RedBox might use @extra more bytes. */ skb = dev_alloc_skb(len + extra + hlen + tlen); if (!skb) return skb; skb_reserve(skb, hlen); skb->dev = master->dev; skb->priority = TC_PRIO_CONTROL; skb_reset_network_header(skb); skb_reset_transport_header(skb); if (dev_hard_header(skb, skb->dev, ETH_P_PRP, hsr->sup_multicast_addr, skb->dev->dev_addr, skb->len) <= 0) goto out; skb_reset_mac_header(skb); skb_reset_mac_len(skb); return skb; out: kfree_skb(skb); return NULL; } static void send_hsr_supervision_frame(struct hsr_port *port, unsigned long *interval, const unsigned char *addr) { struct hsr_priv *hsr = port->hsr; __u8 type = HSR_TLV_LIFE_CHECK; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tlv *hsr_stlv; struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; int extra = 0; *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); if (hsr->announce_count < 3 && hsr->prot_version == 0) { type = HSR_TLV_ANNOUNCE; *interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); hsr->announce_count++; } if (hsr->redbox) extra = sizeof(struct hsr_sup_tlv) + sizeof(struct hsr_sup_payload); skb = hsr_init_skb(port, extra); if (!skb) { netdev_warn_once(port->dev, "HSR: Could not send supervision frame\n"); return; } hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); /* From HSRv1 on we have separate supervision sequence numbers. */ spin_lock_bh(&hsr->seqnr_lock); if (hsr->prot_version > 0) { hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); hsr->sup_sequence_nr++; } else { hsr_stag->sequence_nr = htons(hsr->sequence_nr); hsr->sequence_nr++; } hsr_stag->tlv.HSR_TLV_type = type; /* TODO: Why 12 in HSRv0? */ hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ? 
sizeof(struct hsr_sup_payload) : 12; /* Payload: MacAddressA / SAN MAC from ProxyNodeTable */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, addr); if (hsr->redbox && hsr_is_node_in_db(&hsr->proxy_node_db, addr)) { hsr_stlv = skb_put(skb, sizeof(struct hsr_sup_tlv)); hsr_stlv->HSR_TLV_type = PRP_TLV_REDBOX_MAC; hsr_stlv->HSR_TLV_length = sizeof(struct hsr_sup_payload); /* Payload: MacAddressRedBox */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, hsr->macaddress_redbox); } if (skb_put_padto(skb, ETH_ZLEN)) { spin_unlock_bh(&hsr->seqnr_lock); return; } hsr_forward_skb(skb, port); spin_unlock_bh(&hsr->seqnr_lock); return; } static void send_prp_supervision_frame(struct hsr_port *master, unsigned long *interval, const unsigned char *addr) { struct hsr_priv *hsr = master->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; skb = hsr_init_skb(master, 0); if (!skb) { netdev_warn_once(master->dev, "PRP: Could not send supervision frame\n"); return; } *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); set_hsr_stag_HSR_ver(hsr_stag, (hsr->prot_version ? 1 : 0)); /* From HSRv1 on we have separate supervision sequence numbers. */ spin_lock_bh(&hsr->seqnr_lock); hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); hsr->sup_sequence_nr++; hsr_stag->tlv.HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD; hsr_stag->tlv.HSR_TLV_length = sizeof(struct hsr_sup_payload); /* Payload: MacAddressA */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); if (skb_put_padto(skb, ETH_ZLEN)) { spin_unlock_bh(&hsr->seqnr_lock); return; } hsr_forward_skb(skb, master); spin_unlock_bh(&hsr->seqnr_lock); } /* Announce (supervision frame) timer function */ static void hsr_announce(struct timer_list *t) { struct hsr_priv *hsr; struct hsr_port *master; unsigned long interval; hsr = from_timer(hsr, t, announce_timer); rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr->proto_ops->send_sv_frame(master, &interval, master->dev->dev_addr); if (is_admin_up(master->dev)) mod_timer(&hsr->announce_timer, jiffies + interval); rcu_read_unlock(); } /* Announce (supervision frame) timer function for RedBox */ static void hsr_proxy_announce(struct timer_list *t) { struct hsr_priv *hsr = from_timer(hsr, t, announce_proxy_timer); struct hsr_port *interlink; unsigned long interval = 0; struct hsr_node *node; rcu_read_lock(); /* The RedBox sends supervision frames to the HSR network with the MAC * addresses of the SAN nodes stored in the ProxyNodeTable.
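* * Editorial note (not part of the original source): this timer is armed by hsr_check_announce() at half the normal announce interval, and each pass below emits one supervision frame per non-RedBox entry in the proxy node table via the send_sv_frame() proto op.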
*/ interlink = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); if (!interlink) goto done; list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) { if (hsr_addr_is_redbox(hsr, node->macaddress_A)) continue; hsr->proto_ops->send_sv_frame(interlink, &interval, node->macaddress_A); } if (is_admin_up(interlink->dev)) { if (!interval) interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); mod_timer(&hsr->announce_proxy_timer, jiffies + interval); } done: rcu_read_unlock(); } void hsr_del_ports(struct hsr_priv *hsr) { struct hsr_port *port; port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (port) hsr_del_port(port); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (port) hsr_del_port(port); port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); if (port) hsr_del_port(port); port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (port) hsr_del_port(port); } static void hsr_set_rx_mode(struct net_device *dev) { struct hsr_port *port; struct hsr_priv *hsr; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: dev_mc_sync_multiple(port->dev, dev); dev_uc_sync_multiple(port->dev, dev); break; default: break; } } } static void hsr_change_rx_flags(struct net_device *dev, int change) { struct hsr_port *port; struct hsr_priv *hsr; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: if (change & IFF_ALLMULTI) dev_set_allmulti(port->dev, dev->flags & IFF_ALLMULTI ? 1 : -1); break; default: break; } } } static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { bool is_slave_a_added = false; bool is_slave_b_added = false; struct hsr_port *port; struct hsr_priv *hsr; int ret = 0; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) continue; ret = vlan_vid_add(port->dev, proto, vid); switch (port->type) { case HSR_PT_SLAVE_A: if (ret) { /* clean up Slave-B */ netdev_err(dev, "add vid failed for Slave-A\n"); if (is_slave_b_added) vlan_vid_del(port->dev, proto, vid); return ret; } is_slave_a_added = true; break; case HSR_PT_SLAVE_B: if (ret) { /* clean up Slave-A */ netdev_err(dev, "add vid failed for Slave-B\n"); if (is_slave_a_added) vlan_vid_del(port->dev, proto, vid); return ret; } is_slave_b_added = true; break; default: break; } } return 0; } static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct hsr_port *port; struct hsr_priv *hsr; hsr = netdev_priv(dev); hsr_for_each_port(hsr, port) { switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: vlan_vid_del(port->dev, proto, vid); break; default: break; } } return 0; } static const struct net_device_ops hsr_device_ops = { .ndo_change_mtu = hsr_dev_change_mtu, .ndo_open = hsr_dev_open, .ndo_stop = hsr_dev_close, .ndo_start_xmit = hsr_dev_xmit, .ndo_change_rx_flags = hsr_change_rx_flags, .ndo_fix_features = hsr_fix_features, .ndo_set_rx_mode = hsr_set_rx_mode, .ndo_vlan_rx_add_vid = hsr_ndo_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = hsr_ndo_vlan_rx_kill_vid, }; static const struct device_type hsr_type = { .name = "hsr", }; static struct hsr_proto_ops hsr_ops = { .send_sv_frame = send_hsr_supervision_frame, .create_tagged_frame = hsr_create_tagged_frame, .get_untagged_frame = hsr_get_untagged_frame, .drop_frame = hsr_drop_frame, .fill_frame_info = hsr_fill_frame_info, .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame, }; 
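The hsr_ops and prp_ops tables above are a function-pointer dispatch: the device picks one table when it is set up, and every later operation (supervision frames, tagging, drop decisions) goes through it instead of branching on the protocol. A minimal userspace sketch of that idiom follows; the names and the send_sv_frame signature are simplified stand-ins for illustration, not the kernel API.

#include <stdio.h>

/* Hypothetical stand-ins for the kernel's per-protocol callbacks. */
struct proto_ops {
	const char *name;
	void (*send_sv_frame)(int port);
};

static void hsr_sv(int port) { printf("HSR supervision frame on port %d\n", port); }
static void prp_sv(int port) { printf("PRP supervision frame on port %d\n", port); }

static const struct proto_ops hsr_variant = { "hsr", hsr_sv };
static const struct proto_ops prp_variant = { "prp", prp_sv };

int main(void)
{
	int use_prp = 1; /* chosen once, e.g. from the protocol version */
	const struct proto_ops *ops = use_prp ? &prp_variant : &hsr_variant;

	/* After selection, callers never branch on the protocol again. */
	ops->send_sv_frame(0);
	return 0;
}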
static struct hsr_proto_ops prp_ops = { .send_sv_frame = send_prp_supervision_frame, .create_tagged_frame = prp_create_tagged_frame, .get_untagged_frame = prp_get_untagged_frame, .drop_frame = prp_drop_frame, .fill_frame_info = prp_fill_frame_info, .handle_san_frame = prp_handle_san_frame, .update_san_info = prp_update_san_info, }; void hsr_dev_setup(struct net_device *dev) { eth_hw_addr_random(dev); ether_setup(dev); dev->min_mtu = 0; dev->header_ops = &hsr_header_ops; dev->netdev_ops = &hsr_device_ops; SET_NETDEV_DEVTYPE(dev, &hsr_type); dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; /* Prevent recursive tx locking */ dev->lltx = true; /* Not sure about this. Taken from bridge code. netdevice.h says * it means "Does not change network namespaces". */ dev->netns_local = true; dev->needs_free_netdev = true; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_FILTER; dev->features = dev->hw_features; } /* Return true if dev is a HSR master; return false otherwise. */ bool is_hsr_master(struct net_device *dev) { return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit); } EXPORT_SYMBOL(is_hsr_master); struct net_device *hsr_get_port_ndev(struct net_device *ndev, enum hsr_port_type pt) { struct hsr_priv *hsr = netdev_priv(ndev); struct hsr_port *port; hsr_for_each_port(hsr, port) if (port->type == pt) return port->dev; return NULL; } EXPORT_SYMBOL(hsr_get_port_ndev); /* Default multicast address for HSR Supervision frames */ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00 }; int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], struct net_device *interlink, unsigned char multicast_spec, u8 protocol_version, struct netlink_ext_ack *extack) { bool unregister = false; struct hsr_priv *hsr; int res; hsr = netdev_priv(hsr_dev); INIT_LIST_HEAD(&hsr->ports); INIT_LIST_HEAD(&hsr->node_db); INIT_LIST_HEAD(&hsr->proxy_node_db); spin_lock_init(&hsr->list_lock); eth_hw_addr_set(hsr_dev, slave[0]->dev_addr); /* initialize protocol specific functions */ if (protocol_version == PRP_V1) { /* For PRP, lan_id has most significant 3 bits holding * the net_id of PRP_LAN_ID */ hsr->net_id = PRP_LAN_ID << 1; hsr->proto_ops = &prp_ops; } else { hsr->proto_ops = &hsr_ops; } /* Make sure we recognize frames from ourselves in hsr_rcv() */ res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); if (res < 0) return res; spin_lock_init(&hsr->seqnr_lock); /* Overflow soon to find bugs easier: */ hsr->sequence_nr = HSR_SEQNR_START; hsr->sup_sequence_nr = HSR_SUP_SEQNR_START; timer_setup(&hsr->announce_timer, hsr_announce, 0); timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0); timer_setup(&hsr->prune_proxy_timer, hsr_prune_proxy_nodes, 0); timer_setup(&hsr->announce_proxy_timer, hsr_proxy_announce, 0); ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; hsr->prot_version = protocol_version; /* Make sure the 1st call to netif_carrier_on() gets through */ netif_carrier_off(hsr_dev); res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER, extack); if (res) goto err_add_master; /* HSR forwarding offload supported in lower device? 
*/ if ((slave[0]->features & NETIF_F_HW_HSR_FWD) && (slave[1]->features & NETIF_F_HW_HSR_FWD)) hsr->fwd_offloaded = true; if ((slave[0]->features & NETIF_F_HW_VLAN_CTAG_FILTER) && (slave[1]->features & NETIF_F_HW_VLAN_CTAG_FILTER)) hsr_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; res = register_netdevice(hsr_dev); if (res) goto err_unregister; unregister = true; res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A, extack); if (res) goto err_unregister; res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B, extack); if (res) goto err_unregister; if (interlink) { res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack); if (res) goto err_unregister; hsr->redbox = true; ether_addr_copy(hsr->macaddress_redbox, interlink->dev_addr); mod_timer(&hsr->prune_proxy_timer, jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD)); } hsr_debugfs_init(hsr, hsr_dev); mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); return 0; err_unregister: hsr_del_ports(hsr); err_add_master: hsr_del_self_node(hsr); if (unregister) unregister_netdevice(hsr_dev); return res; }
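hsr_dev_finalize() above is a textbook staged initialization: each fallible step jumps to an error label that unwinds only what already succeeded, with the unregister flag recording whether register_netdevice() completed. A compact userspace sketch of that unwind pattern, with hypothetical step names:

#include <stdbool.h>
#include <stdio.h>

static int step_a(void) { return 0; }   /* succeeds */
static int step_b(void) { return -1; }  /* fails, forcing the unwind */
static void undo_a(void) { puts("undo a"); }

static int setup(void)
{
	bool a_done = false;
	int ret;

	ret = step_a();
	if (ret)
		goto err;
	a_done = true;

	ret = step_b();
	if (ret)
		goto err;

	return 0;
err:
	/* Unwind only the steps that actually completed, in reverse order,
	 * like the unregister flag gating unregister_netdevice() above. */
	if (a_done)
		undo_a();
	return ret;
}

int main(void)
{
	return setup() ? 1 : 0;
}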
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_VSOCK_H #define _LINUX_VIRTIO_VSOCK_H #include <uapi/linux/virtio_vsock.h> #include <linux/socket.h> #include <net/sock.h> #include <net/af_vsock.h> #define VIRTIO_VSOCK_SKB_HEADROOM (sizeof(struct virtio_vsock_hdr)) struct virtio_vsock_skb_cb { bool reply; bool tap_delivered; u32 offset; }; #define VIRTIO_VSOCK_SKB_CB(skb) ((struct virtio_vsock_skb_cb *)((skb)->cb)) static inline struct virtio_vsock_hdr *virtio_vsock_hdr(struct sk_buff *skb) { return (struct virtio_vsock_hdr *)skb->head; } static inline bool virtio_vsock_skb_reply(struct sk_buff *skb) { return VIRTIO_VSOCK_SKB_CB(skb)->reply; } static inline void virtio_vsock_skb_set_reply(struct sk_buff *skb) { VIRTIO_VSOCK_SKB_CB(skb)->reply = true; } static inline bool virtio_vsock_skb_tap_delivered(struct sk_buff *skb) { return VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered; } static inline void virtio_vsock_skb_set_tap_delivered(struct sk_buff *skb) { VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = true; } static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb) { VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = false; } static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb) { u32 len; len = le32_to_cpu(virtio_vsock_hdr(skb)->len); if (len > 0) skb_put(skb, len); } static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask) { struct sk_buff *skb; if (size < VIRTIO_VSOCK_SKB_HEADROOM) return NULL; skb = alloc_skb(size, mask); if (!skb) return NULL; skb_reserve(skb, VIRTIO_VSOCK_SKB_HEADROOM); return skb; } static inline void virtio_vsock_skb_queue_head(struct sk_buff_head *list, struct sk_buff *skb) { spin_lock_bh(&list->lock); __skb_queue_head(list, skb); spin_unlock_bh(&list->lock); } static inline void virtio_vsock_skb_queue_tail(struct sk_buff_head *list, struct sk_buff *skb) { spin_lock_bh(&list->lock); __skb_queue_tail(list, skb); spin_unlock_bh(&list->lock); } static inline struct sk_buff *virtio_vsock_skb_dequeue(struct sk_buff_head *list) { struct sk_buff *skb; spin_lock_bh(&list->lock); skb = __skb_dequeue(list); spin_unlock_bh(&list->lock); return skb; } static inline void virtio_vsock_skb_queue_purge(struct sk_buff_head *list) { spin_lock_bh(&list->lock); __skb_queue_purge(list); spin_unlock_bh(&list->lock); } static inline size_t virtio_vsock_skb_len(struct sk_buff *skb) { return (size_t)(skb_end_pointer(skb) - skb->head); } #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) enum { VSOCK_VQ_RX

= 0, /* for host to guest data */ VSOCK_VQ_TX = 1, /* for guest to host data */ VSOCK_VQ_EVENT = 2, VSOCK_VQ_MAX = 3, }; /* Per-socket state (accessed via vsk->trans) */ struct virtio_vsock_sock { struct vsock_sock *vsk; spinlock_t tx_lock; spinlock_t rx_lock; /* Protected by tx_lock */ u32 tx_cnt; u32 peer_fwd_cnt; u32 peer_buf_alloc; size_t bytes_unsent; /* Protected by rx_lock */ u32 fwd_cnt; u32 last_fwd_cnt; u32 rx_bytes; u32 buf_alloc; struct sk_buff_head rx_queue; u32 msg_count; }; struct virtio_vsock_pkt_info { u32 remote_cid, remote_port; struct vsock_sock *vsk; struct msghdr *msg; u32 pkt_len; u16 type; u16 op; u32 flags; bool reply; }; struct virtio_transport { /* This must be the first field */ struct vsock_transport transport; /* Takes ownership of the packet */ int (*send_pkt)(struct sk_buff *skb); /* Used in MSG_ZEROCOPY mode. Checks, that provided data * (number of buffers) could be transmitted with zerocopy * mode. If this callback is not implemented for the current * transport - this means that this transport doesn't need * extra checks and can perform zerocopy transmission by * default. */ bool (*can_msgzerocopy)(int bufs_num); }; ssize_t virtio_transport_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int type); int virtio_transport_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags); int virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk, struct msghdr *msg, size_t len); ssize_t virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, struct msghdr *msg, int flags); s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk); ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk); void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume); int virtio_transport_do_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk); int virtio_transport_notify_poll_in(struct vsock_sock *vsk, size_t target, bool *data_ready_now); int virtio_transport_notify_poll_out(struct vsock_sock *vsk, size_t target, bool *space_available_now); int virtio_transport_notify_recv_init(struct vsock_sock *vsk, size_t target, struct vsock_transport_recv_notify_data *data); int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, size_t target, struct vsock_transport_recv_notify_data *data); int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, size_t target, struct vsock_transport_recv_notify_data *data); int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, size_t target, ssize_t copied, bool data_read, struct vsock_transport_recv_notify_data *data); int virtio_transport_notify_send_init(struct vsock_sock *vsk, struct vsock_transport_send_notify_data *data); int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, struct vsock_transport_send_notify_data *data); int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, struct vsock_transport_send_notify_data *data); int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written, struct vsock_transport_send_notify_data *data); void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val); u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); bool virtio_transport_stream_is_active(struct vsock_sock *vsk); bool virtio_transport_stream_allow(u32 cid, u32 port); int virtio_transport_dgram_bind(struct vsock_sock *vsk, struct sockaddr_vm 
*addr); bool virtio_transport_dgram_allow(u32 cid, u32 port); int virtio_transport_connect(struct vsock_sock *vsk); int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); void virtio_transport_release(struct vsock_sock *vsk); ssize_t virtio_transport_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, size_t len); int virtio_transport_dgram_enqueue(struct vsock_sock *vsk, struct sockaddr_vm *remote_addr, struct msghdr *msg, size_t len); void virtio_transport_destruct(struct vsock_sock *vsk); void virtio_transport_recv_pkt(struct virtio_transport *t, struct sk_buff *skb); void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb); u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted); void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val); #endif /* _LINUX_VIRTIO_VSOCK_H */
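The virtio_vsock_skb_queue_* helpers above wrap each sk_buff_head operation in the list's own spinlock so callers cannot race on the queue. Below is a portable sketch of the same pairing using pthreads and a hand-rolled list; it is illustrative only, the kernel versions use spin_lock_bh() and the skb queue primitives shown above.

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int payload; };

struct locked_queue {
	pthread_mutex_t lock;
	struct node *head, *tail;
};

/* Every operation takes the queue's own lock, mirroring how
 * virtio_vsock_skb_queue_tail() pairs __skb_queue_tail() with list->lock. */
static void queue_tail(struct locked_queue *q, struct node *n)
{
	pthread_mutex_lock(&q->lock);
	n->next = NULL;
	if (q->tail)
		q->tail->next = n;
	else
		q->head = n;
	q->tail = n;
	pthread_mutex_unlock(&q->lock);
}

static struct node *queue_dequeue(struct locked_queue *q)
{
	struct node *n;

	pthread_mutex_lock(&q->lock);
	n = q->head;
	if (n) {
		q->head = n->next;
		if (!q->head)
			q->tail = NULL;
	}
	pthread_mutex_unlock(&q->lock);
	return n;
}

int main(void)
{
	struct locked_queue q = { PTHREAD_MUTEX_INITIALIZER, NULL, NULL };
	struct node a = { NULL, 42 };

	queue_tail(&q, &a);
	printf("%d\n", queue_dequeue(&q)->payload); /* prints 42 */
	return 0;
}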
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H static inline u64 lru_pos_id(struct bpos pos) { return pos.inode >> LRU_TIME_BITS; } static inline u64 lru_pos_time(struct bpos pos) { return pos.inode & ~(~0ULL << LRU_TIME_BITS); } static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) { struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); EBUG_ON(time > LRU_TIME_MAX); EBUG_ON(lru_pos_id(pos) != lru_id); EBUG_ON(lru_pos_time(pos) != time); EBUG_ON(pos.offset != dev_bucket); return pos; } static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; if (lru_id == BCH_LRU_FRAGMENTATION_START) return BCH_LRU_fragmentation; return BCH_LRU_read; } int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); #define bch2_bkey_ops_lru ((struct bkey_ops) { \ .key_validate = bch2_lru_validate, \ .val_to_text = bch2_lru_to_text, \ .min_val_size = 8, \ }) int bch2_lru_del(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); struct bkey_buf; int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); int bch2_check_lrus(struct bch_fs *); #endif /* _BCACHEFS_LRU_H */
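lru_pos() packs the LRU id into the high bits of the 64-bit inode field and the time into the low LRU_TIME_BITS, which lru_pos_id() and lru_pos_time() then unpack. A standalone round-trip check of that packing, assuming LRU_TIME_BITS is 48 to match the 48-bit shift in lru_type():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LRU_TIME_BITS 48

/* Same packing as lru_pos(): id in the top 16 bits, time in the low 48. */
static uint64_t pack(uint16_t lru_id, uint64_t time)
{
	return ((uint64_t)lru_id << LRU_TIME_BITS) | time;
}

static uint64_t pos_id(uint64_t inode)   { return inode >> LRU_TIME_BITS; }
static uint64_t pos_time(uint64_t inode) { return inode & ~(~0ULL << LRU_TIME_BITS); }

int main(void)
{
	uint64_t inode = pack(7, 123456789);

	/* The EBUG_ON()s in lru_pos() assert exactly this round trip. */
	assert(pos_id(inode) == 7);
	assert(pos_time(inode) == 123456789);
	printf("id=%llu time=%llu\n",
	       (unsigned long long)pos_id(inode),
	       (unsigned long long)pos_time(inode));
	return 0;
}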
// SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct
user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller because CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace.
*/ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ns->binfmt_misc); #endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; /* == 0 unless used with map_id_range_down() */ }; /* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. 
*/ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { unsigned idx; u32 first, last; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_up_base(extents, map, id); else extent = map_id_up_max(extents, map, id); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. 
* * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace projid pair into a kernel internal kprojid, * and returns that kprojid.
* * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the projid to a global kernel projid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the projid from a global kernel projid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat where * failing the system call and failing to provide a valid projid are * not options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos];
} static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. 
*/ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup_array(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilities in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's.
Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace. */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used to guarantee the write * order and smp_rmb() guarantees that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate that the user is allowed to use the user ids mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space.
*/ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. 
* And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. 
*/ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init);
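/*
 * Editor's illustration, not part of the kernel sources above: a minimal
 * userspace sketch of how the proc files implemented by
 * proc_uid_map_write(), proc_gid_map_write() and proc_setgroups_write()
 * are typically driven after unshare(CLONE_NEWUSER). The mapping values
 * and the use of /proc/self are examples; error handling is abbreviated.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	/* A failed write here typically surfaces as EPERM or EINVAL from
	 * map_write()/new_idmap_permitted() in the code above. */
	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	char map[64];

	if (unshare(CLONE_NEWUSER) < 0) {
		perror("unshare");
		return 1;
	}
	/* "deny" must be written before gid_map for an unprivileged
	 * mapping; see the USERNS_SETGROUPS_ALLOWED check in
	 * new_idmap_permitted(). */
	write_file("/proc/self/setgroups", "deny");

	/* Extent format is "ID-inside-ns ID-outside-ns length" */
	snprintf(map, sizeof(map), "0 %u 1", (unsigned int)getuid());
	write_file("/proc/self/uid_map", map);
	snprintf(map, sizeof(map), "0 %u 1", (unsigned int)getgid());
	write_file("/proc/self/gid_map", map);

	execlp("id", "id", (char *)NULL); /* expect uid=0(root) inside the ns */
	return 1;
}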
// SPDX-License-Identifier: GPL-2.0-only
/*
 * rcuref - A scalable reference count implementation for RCU managed objects
 *
 * rcuref is provided to replace open coded reference count implementations
 * based on atomic_t. It protects explicitly RCU managed objects which can
 * be visible even after the last reference has been dropped and the object
 * is heading towards destruction.
 *
 * A common usage pattern is:
 *
 * get()
 *	rcu_read_lock();
 *	p = get_ptr();
 *	if (p && !atomic_inc_not_zero(&p->refcnt))
 *		p = NULL;
 *	rcu_read_unlock();
 *	return p;
 *
 * put()
 *	if (!atomic_dec_return(&p->refcnt)) {
 *		remove_ptr(p);
 *		kfree_rcu(p, rcu);
 *	}
 *
 * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
 * O(N^2) behaviour under contention with N concurrent operations.
 *
 * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
 * better under contention.
 *
 * Why not refcount?
 * =================
 *
 * In principle it should be possible to make refcount use the rcuref
 * scheme, but the destruction race described below cannot be prevented
 * unless the protected object is RCU managed.
 *
 * Theory of operation
 * ===================
 *
 * rcuref uses an unsigned integer reference counter. As long as the
 * counter value is greater than or equal to RCUREF_ONEREF and not larger
 * than RCUREF_MAXREF the reference is alive:
 *
 *	ONEREF = 0x00000000 ..................... MAXREF = 0x7FFFFFFF
 *	<-------------------- valid zone ------------------>
 *	0x80000000 ... SATURATED = 0xA0000000 ... 0xBFFFFFFF
 *	<----------------- saturation zone ----------------->
 *	RELEASED = 0xC0000000 ... DEAD = 0xE0000000 ... NOREF = 0xFFFFFFFF
 *	<--------------------- dead zone ------------------->
 *
 * The get() and put() operations do unconditional increments and
 * decrements. The result is checked after the operation. This optimizes
 * for the fast path.
 *
 * If the reference count is saturated or dead, then the increments and
 * decrements are not harmful as the reference count still stays in the
 * respective zones and is always set back to SATURATED or DEAD,
 * respectively. The zones have room for 2^28 racing operations in each
 * direction, which makes it practically impossible to escape the zones.
 *
 * Once the last reference is dropped the reference count becomes
 * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
 * slowpath then tries to set the reference count from RCUREF_NOREF to
 * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
 * concurrent rcuref_get() can acquire the reference count and bring it
 * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
 *
 * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
 * DEAD + 1, which is inside the dead zone. If that happens the reference
 * count is put back to DEAD.
 *
 * The actual race is possible due to the unconditional increments and
 * decrements in rcuref_get() and rcuref_put():
 *
 * T1					T2
 * get()				put()
 *					if (atomic_add_negative(-1, &ref->refcnt))
 *						succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
 *
 * atomic_add_negative(1, &ref->refcnt);	<- Elevates refcount to DEAD + 1
 *
 * As the result of T1's add is negative, the get() goes into the slow path
 * and observes refcnt being in the dead zone which makes the operation fail.
 *
 * Possible critical states:
 *
 * Context	Counter	References	Operation
 * T1		0	1		init()
 * T2		1	2		get()
 * T1		0	1		put()
 * T2		-1	0		put() tries to mark dead
 * T1		0	1		get()
 * T2		0	1		put() mark dead fails
 * T1		-1	0		put() tries to mark dead
 * T1		DEAD	0		put() mark dead succeeds
 * T2		DEAD+1	0		get() fails and puts it back to DEAD
 *
 * Of course there are more complex scenarios, but the above illustrates
 * the working principle. The rest is left to the imagination of the
 * reader.
 *
 * Deconstruction race
 * ===================
 *
 * The release operation must be protected by prohibiting a grace period in
 * order to prevent a possible use after free:
 *
 * T1					T2
 * put()				get()
 * // ref->refcnt = ONEREF
 * if (!atomic_add_negative(-1, &ref->refcnt))
 *	return false;			<- Not taken
 *
 * // ref->refcnt == NOREF
 * --> preemption
 *					// Elevates ref->refcnt to ONEREF
 *					if (!atomic_add_negative(1, &ref->refcnt))
 *						return true;	<- taken
 *
 *					if (put(&p->ref)) { <-- Succeeds
 *						remove_pointer(p);
 *						kfree_rcu(p, rcu);
 *					}
 *
 *		RCU grace period ends, object is freed
 *
 * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);	<- UAF
 *
 * This is prevented by disabling preemption around the put() operation as
 * that's in most kernel configurations cheaper than a rcu_read_lock() /
 * rcu_read_unlock() pair and in many cases even a NOOP. In any case it
 * prevents the grace period which keeps the object alive until all put()
 * operations complete.
 *
 * Saturation protection
 * =====================
 *
 * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
 * Once this is exceeded the reference count becomes stale by setting it
 * to RCUREF_SATURATED, which will cause a memory leak, but it prevents
 * wrap arounds which obviously cause worse problems than a memory
 * leak. When saturation is reached a warning is emitted.
 *
 * Race conditions
 * ===============
 *
 * All reference count increment/decrement operations are unconditional and
 * only verified after the fact. This optimizes for the good case and takes
 * the occasional race vs. a dead or already saturated refcount into
 * account. The saturation and dead zones are large enough to accommodate
 * for that.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
 * and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object to increase the
 * reference count on will provide the ordering. For locked data
 * structures, it's the lock acquire, for RCU/lockless data structures it's
 * the dependent load.
 *
 * rcuref_get() provides a control dependency ordering future stores which
 * ensures that the object is not modified when acquiring a reference
 * fails.
 *
 * rcuref_put() provides release order, i.e. all prior loads and stores
 * will be issued before.
It also provides a control dependency ordering * against the subsequent destruction of the object. * * If rcuref_put() successfully dropped the last reference and marked the * object DEAD it also provides acquire ordering. */ #include <linux/export.h> #include <linux/rcuref.h> /** * rcuref_get_slowpath - Slowpath of rcuref_get() * @ref: Pointer to the reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * False if the reference count was already marked dead * * True if the reference count is saturated, which prevents the * object from being deconstructed ever. */ bool rcuref_get_slowpath(rcuref_t *ref) { unsigned int cnt = atomic_read(&ref->refcnt); /* * If the reference count was already marked dead, undo the * increment so it stays in the middle of the dead zone and return * fail. */ if (cnt >= RCUREF_RELEASED) { atomic_set(&ref->refcnt, RCUREF_DEAD); return false; } /* * If it was saturated, warn and mark it so. In case the increment * was already on a saturated value restore the saturation * marker. This keeps it in the middle of the saturation zone and * prevents the reference count from overflowing. This leaks the * object memory, but prevents the obvious reference count overflow * damage. */ if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory")) atomic_set(&ref->refcnt, RCUREF_SATURATED); return true; } EXPORT_SYMBOL_GPL(rcuref_get_slowpath); /** * rcuref_put_slowpath - Slowpath of __rcuref_put() * @ref: Pointer to the reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool rcuref_put_slowpath(rcuref_t *ref) { unsigned int cnt = atomic_read(&ref->refcnt); /* Did this drop the last reference? */ if (likely(cnt == RCUREF_NOREF)) { /* * Carefully try to set the reference count to RCUREF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { atomic_set(&ref->refcnt, RCUREF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > RCUREF_MAXREF) atomic_set(&ref->refcnt, RCUREF_SATURATED); return false; } EXPORT_SYMBOL_GPL(rcuref_put_slowpath);
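/*
 * Editor's sketch, not part of rcuref.c: the lookup pattern from the
 * header comment expressed with the rcuref API from <linux/rcuref.h>.
 * 'struct obj' and the 'slot' pointer are illustrative names.
 */
#include <linux/rcuref.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
	rcuref_t	refcnt;
	struct rcu_head	rcu;
};

static struct obj __rcu *slot;	/* some RCU-managed published pointer */

static struct obj *obj_alloc(void)
{
	struct obj *p = kzalloc(sizeof(*p), GFP_KERNEL);

	if (p)
		rcuref_init(&p->refcnt, 1);	/* count starts at ONEREF */
	return p;
}

static struct obj *obj_get(void)
{
	struct obj *p;

	rcu_read_lock();
	p = rcu_dereference(slot);
	/* rcuref_get() fails only once the count is in the dead zone */
	if (p && !rcuref_get(&p->refcnt))
		p = NULL;
	rcu_read_unlock();
	return p;
}

static void obj_put(struct obj *p)
{
	/* True only for the final put(); preemption is disabled inside
	 * rcuref_put() to prevent the grace period, per the comment above. */
	if (rcuref_put(&p->refcnt)) {
		RCU_INIT_POINTER(slot, NULL);
		kfree_rcu(p, rcu);
	}
}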
// SPDX-License-Identifier: GPL-2.0-or-later
/* Task credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) "CRED: " fmt

#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
#include <linux/binfmts.h>
#include <linux/cn_proc.h>
#include <linux/uidgid.h>

#if 0
#define kdebug(FMT, ...)						\
	printk("[%-5.5s%5u] " FMT "\n",					\
	       current->comm, current->pid, ##__VA_ARGS__)
#else
#define kdebug(FMT, ...)
\ do { \ if (0) \ no_printk("[%-5.5s%5u] " FMT "\n", \ current->comm, current->pid, ##__VA_ARGS__); \ } while (0) #endif static struct kmem_cache *cred_jar; /* init to 2 - one for init_task, one to ensure it is never freed */ static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; /* * The initial credentials for the initial task */ struct cred init_cred = { .usage = ATOMIC_INIT(4), .uid = GLOBAL_ROOT_UID, .gid = GLOBAL_ROOT_GID, .suid = GLOBAL_ROOT_UID, .sgid = GLOBAL_ROOT_GID, .euid = GLOBAL_ROOT_UID, .egid = GLOBAL_ROOT_GID, .fsuid = GLOBAL_ROOT_UID, .fsgid = GLOBAL_ROOT_GID, .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_EMPTY_SET, .cap_permitted = CAP_FULL_SET, .cap_effective = CAP_FULL_SET, .cap_bset = CAP_FULL_SET, .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, .ucounts = &init_ucounts, }; /* * The RCU callback to actually dispose of a set of credentials */ static void put_cred_rcu(struct rcu_head *rcu) { struct cred *cred = container_of(rcu, struct cred, rcu); kdebug("put_cred_rcu(%p)", cred); if (atomic_long_read(&cred->usage) != 0) panic("CRED: put_cred_rcu() sees %p with usage %ld\n", cred, atomic_long_read(&cred->usage)); security_cred_free(cred); key_put(cred->session_keyring); key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); if (cred->ucounts) put_ucounts(cred->ucounts); put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } /** * __put_cred - Destroy a set of credentials * @cred: The record to release * * Destroy a set of credentials on which no references remain. */ void __put_cred(struct cred *cred) { kdebug("__put_cred(%p{%ld})", cred, atomic_long_read(&cred->usage)); BUG_ON(atomic_long_read(&cred->usage) != 0); BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); if (cred->non_rcu) put_cred_rcu(&cred->rcu); else call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); /* * Clean up a task's credentials when it exits */ void exit_creds(struct task_struct *tsk) { struct cred *real_cred, *cred; kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred, atomic_long_read(&tsk->cred->usage)); real_cred = (struct cred *) tsk->real_cred; tsk->real_cred = NULL; cred = (struct cred *) tsk->cred; tsk->cred = NULL; if (real_cred == cred) { put_cred_many(cred, 2); } else { put_cred(real_cred); put_cred(cred); } #ifdef CONFIG_KEYS_REQUEST_CACHE key_put(tsk->cached_requested_key); tsk->cached_requested_key = NULL; #endif } /** * get_task_cred - Get another task's objective credentials * @task: The task to query * * Get the objective credentials of a task, pinning them so that they can't go * away. Accessing a task's credentials directly is not permitted. * * The caller must also make sure task doesn't get deleted, either by holding a * ref on task or by holding tasklist_lock to prevent it from being unlinked. */ const struct cred *get_task_cred(struct task_struct *task) { const struct cred *cred; rcu_read_lock(); do { cred = __task_cred((task)); BUG_ON(!cred); } while (!get_cred_rcu(cred)); rcu_read_unlock(); return cred; } EXPORT_SYMBOL(get_task_cred); /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. 
*/ struct cred *cred_alloc_blank(void) { struct cred *new; new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); if (!new) return NULL; atomic_long_set(&new->usage, 1); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } /** * prepare_creds - Prepare a new set of credentials for modification * * Prepare a new set of task credentials for modification. A task's creds * shouldn't generally be modified directly, therefore this function is used to * prepare a new copy, which the caller then modifies and then commits by * calling commit_creds(). * * Preparation involves making a copy of the objective creds for modification. * * Returns a pointer to the new creds-to-be if successful, NULL otherwise. * * Call commit_creds() or abort_creds() to clean up. */ struct cred *prepare_creds(void) { struct task_struct *task = current; const struct cred *old; struct cred *new; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_creds() alloc %p", new); old = task->cred; memcpy(new, old, sizeof(struct cred)); new->non_rcu = 0; atomic_long_set(&new->usage, 1); get_group_info(new->group_info); get_uid(new->user); get_user_ns(new->user_ns); #ifdef CONFIG_KEYS key_get(new->session_keyring); key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() * - The caller must hold ->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { struct cred *new; new = prepare_creds(); if (!new) return new; #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; /* inherit the session keyring; new process keyring */ key_put(new->process_keyring); new->process_keyring = NULL; #endif new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; return new; } /* * Copy credentials for the new process created by fork() * * We share if we can, but under some circumstances we have to generate a new * set. * * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { struct cred *new; int ret; #ifdef CONFIG_KEYS_REQUEST_CACHE p->cached_requested_key = NULL; #endif if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && #endif clone_flags & CLONE_THREAD ) { p->real_cred = get_cred_many(p->cred, 2); kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; } new = prepare_creds(); if (!new) return -ENOMEM; if (clone_flags & CLONE_NEWUSER) { ret = create_user_ns(new); if (ret < 0) goto error_put; ret = set_cred_ucounts(new); if (ret < 0) goto error_put; } #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ if (new->thread_keyring) { key_put(new->thread_keyring); new->thread_keyring = NULL; if (clone_flags & CLONE_THREAD) install_thread_keyring_to_cred(new); } /* The process keyring is only shared between the threads in a process; * anything outside of those threads doesn't inherit. 
*/ if (!(clone_flags & CLONE_THREAD)) { key_put(new->process_keyring); new->process_keyring = NULL; } #endif p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; error_put: put_cred(new); return ret; } static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) { const struct user_namespace *set_ns = set->user_ns; const struct user_namespace *subset_ns = subset->user_ns; /* If the two credentials are in the same user namespace see if * the capabilities of subset are a subset of set. */ if (set_ns == subset_ns) return cap_issubset(subset->cap_permitted, set->cap_permitted); /* The credentials are in a different user namespaces * therefore one is a subset of the other only if a set is an * ancestor of subset and set->euid is owner of subset or one * of subsets ancestors. */ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { if ((set_ns == subset_ns->parent) && uid_eq(subset_ns->owner, set->euid)) return true; } return false; } /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned * * Install a new set of credentials to the current task, using RCU to replace * the old set. Both the objective and the subjective credentials pointers are * updated. This function may not be called if the subjective credentials are * in an overridden state. * * This function eats the caller's reference to the new credentials. * * Always returns 0 thus allowing this function to be tail-called at the end * of, say, sys_setgid(). */ int commit_creds(struct cred *new) { struct task_struct *task = current; const struct cred *old = task->real_cred; kdebug("commit_creds(%p{%ld})", new, atomic_long_read(&new->usage)); BUG_ON(task->cred != old); BUG_ON(atomic_long_read(&new->usage) < 1); get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ if (!uid_eq(old->euid, new->euid) || !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; /* * If a task drops privileges and becomes nondumpable, * the dumpability change must become visible before * the credential change; otherwise, a __ptrace_may_access() * racing with this change may be able to attach to a task it * shouldn't be able to attach to (as if the task had dropped * privileges without becoming nondumpable). * Pairs with a read barrier in __ptrace_may_access(). */ smp_wmb(); } /* alter the thread keyring */ if (!uid_eq(new->fsuid, old->fsuid)) key_fsuid_changed(new); if (!gid_eq(new->fsgid, old->fsgid)) key_fsgid_changed(new); /* do it * RLIMIT_NPROC limits on user->processes have already been checked * in set_user(). 
 */
	if (new->user != old->user || new->user_ns != old->user_ns)
		inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
	rcu_assign_pointer(task->real_cred, new);
	rcu_assign_pointer(task->cred, new);
	if (new->user != old->user || new->user_ns != old->user_ns)
		dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);

	/* send notifications */
	if (!uid_eq(new->uid,   old->uid)  ||
	    !uid_eq(new->euid,  old->euid) ||
	    !uid_eq(new->suid,  old->suid) ||
	    !uid_eq(new->fsuid, old->fsuid))
		proc_id_connector(task, PROC_EVENT_UID);

	if (!gid_eq(new->gid,   old->gid)  ||
	    !gid_eq(new->egid,  old->egid) ||
	    !gid_eq(new->sgid,  old->sgid) ||
	    !gid_eq(new->fsgid, old->fsgid))
		proc_id_connector(task, PROC_EVENT_GID);

	/* release the old obj and subj refs both */
	put_cred_many(old, 2);
	return 0;
}
EXPORT_SYMBOL(commit_creds);

/**
 * abort_creds - Discard a set of credentials and unlock the current task
 * @new: The credentials that were going to be applied
 *
 * Discard a set of credentials that were under construction and unlock the
 * current task.
 */
void abort_creds(struct cred *new)
{
	kdebug("abort_creds(%p{%ld})", new,
	       atomic_long_read(&new->usage));

	BUG_ON(atomic_long_read(&new->usage) < 1);
	put_cred(new);
}
EXPORT_SYMBOL(abort_creds);

/**
 * cred_fscmp - Compare two credentials with respect to filesystem access.
 * @a: The first credential
 * @b: The second credential
 *
 * cred_fscmp() will return zero if both credentials have the same
 * fsuid, fsgid, and supplementary groups. That is, if they will both
 * provide the same access to files based on mode/uid/gid.
 * If the credentials are different, then either -1 or 1 will
 * be returned depending on whether @a comes before or after @b
 * respectively in an arbitrary, but stable, ordering of credentials.
 *
 * Return: -1, 0, or 1 depending on comparison
 */
int cred_fscmp(const struct cred *a, const struct cred *b)
{
	struct group_info *ga, *gb;
	int g;

	if (a == b)
		return 0;
	if (uid_lt(a->fsuid, b->fsuid))
		return -1;
	if (uid_gt(a->fsuid, b->fsuid))
		return 1;

	if (gid_lt(a->fsgid, b->fsgid))
		return -1;
	if (gid_gt(a->fsgid, b->fsgid))
		return 1;

	ga = a->group_info;
	gb = b->group_info;
	if (ga == gb)
		return 0;
	if (ga == NULL)
		return -1;
	if (gb == NULL)
		return 1;
	if (ga->ngroups < gb->ngroups)
		return -1;
	if (ga->ngroups > gb->ngroups)
		return 1;

	for (g = 0; g < ga->ngroups; g++) {
		if (gid_lt(ga->gid[g], gb->gid[g]))
			return -1;
		if (gid_gt(ga->gid[g], gb->gid[g]))
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL(cred_fscmp);

int set_cred_ucounts(struct cred *new)
{
	struct ucounts *new_ucounts, *old_ucounts = new->ucounts;

	/*
	 * This optimization is needed because alloc_ucounts() uses locks
	 * for table lookups.
	 */
	if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid))
		return 0;

	if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid)))
		return -EAGAIN;

	new->ucounts = new_ucounts;
	put_ucounts(old_ucounts);

	return 0;
}

/*
 * initialise the credentials stuff
 */
void __init cred_init(void)
{
	/* allocate a slab in which we can store credentials */
	cred_jar = KMEM_CACHE(cred, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}

/**
 * prepare_kernel_cred - Prepare a set of credentials for a kernel service
 * @daemon: A userspace daemon to be used as a reference
 *
 * Prepare a set of credentials for a kernel service. This can then be used to
 * override a task's own credentials so that work can be done on behalf of that
 * task that requires a different subjective context.
* * @daemon is used to provide a base cred, with the security data derived from * that; if this is "&init_task", they'll be set to 0, no groups, full * capabilities, and no keys. * * The caller may change these controls afterwards if desired. * * Returns the new credentials or NULL if out of memory. */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { const struct cred *old; struct cred *new; if (WARN_ON_ONCE(!daemon)) return NULL; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_kernel_cred() alloc %p", new); old = get_task_cred(daemon); *new = *old; new->non_rcu = 0; atomic_long_set(&new->usage, 1); get_uid(new->user); get_user_ns(new->user_ns); get_group_info(new->group_info); #ifdef CONFIG_KEYS new->session_keyring = NULL; new->process_keyring = NULL; new->thread_keyring = NULL; new->request_key_auth = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; put_cred(old); return new; error: put_cred(new); put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); /** * set_security_override - Set the security ID in a set of credentials * @new: The credentials to alter * @secid: The LSM security ID to set * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. */ int set_security_override(struct cred *new, u32 secid) { return security_kernel_act_as(new, secid); } EXPORT_SYMBOL(set_security_override); /** * set_security_override_from_ctx - Set the security ID in a set of credentials * @new: The credentials to alter * @secctx: The LSM security context to generate the security ID from. * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. The * security ID is specified in string form as a security context to be * interpreted by the LSM. */ int set_security_override_from_ctx(struct cred *new, const char *secctx) { u32 secid; int ret; ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); if (ret < 0) return ret; return set_security_override(new, secid); } EXPORT_SYMBOL(set_security_override_from_ctx); /** * set_create_files_as - Set the LSM file create context in a set of credentials * @new: The credentials to alter * @inode: The inode to take the context from * * Change the LSM file creation context in a set of credentials to be the same * as the object context of the specified inode, so that the new inodes have * the same MAC context as that inode. */ int set_create_files_as(struct cred *new, struct inode *inode) { if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as);
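/*
 * Editor's sketch, not part of cred.c: the prepare/modify/commit pattern
 * that the kdoc above describes, roughly as a setgid-style syscall would
 * use it. The permission check is elided and 'kgid' is an example value.
 */
#include <linux/cred.h>
#include <linux/errno.h>

static int set_egid_example(kgid_t kgid)
{
	struct cred *new = prepare_creds();

	if (!new)
		return -ENOMEM;

	if (!gid_valid(kgid)) {
		abort_creds(new);	/* drops the single reference */
		return -EINVAL;
	}

	new->egid = kgid;
	new->fsgid = kgid;

	/* Consumes the reference on 'new' and always returns 0 */
	return commit_creds(new);
}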
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
 * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * id.c
 */

/*
 * This file implements code to handle uids and gids.
 *
 * For space efficiency regular files store uid and gid indexes, which are
 * converted to 32-bit uids/gids using an id look up table. This table is
 * stored compressed into metadata blocks. A second index table is used to
 * locate these. This second index table for speed of access (and because it
 * is small) is read at mount time and cached in memory.
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/slab.h>

#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs.h"

/*
 * Map uid/gid index into real 32-bit uid/gid using the id look up table
 */
int squashfs_get_id(struct super_block *sb, unsigned int index,
		    unsigned int *id)
{
	struct squashfs_sb_info *msblk = sb->s_fs_info;
	int block = SQUASHFS_ID_BLOCK(index);
	int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
	u64 start_block;
	__le32 disk_id;
	int err;

	if (index >= msblk->ids)
		return -EINVAL;

	start_block = le64_to_cpu(msblk->id_table[block]);

	err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset,
				     sizeof(disk_id));
	if (err < 0)
		return err;

	*id = le32_to_cpu(disk_id);
	return 0;
}

/*
 * Read uncompressed id lookup table indexes from disk into memory
 */
__le64 *squashfs_read_id_index_table(struct super_block *sb,
				     u64 id_table_start, u64 next_table,
				     unsigned short no_ids)
{
	unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
	unsigned int indexes = SQUASHFS_ID_BLOCKS(no_ids);
	int n;
	__le64 *table;
	u64 start, end;

	TRACE("In read_id_index_table, length %d\n", length);

	/* Sanity check values */

	/* there should always be at least one id */
	if (no_ids == 0)
		return ERR_PTR(-EINVAL);

	/*
	 * The computed size of the index table (length bytes) should exactly
	 * match the table start and end points
	 */
	if (length != (next_table - id_table_start))
		return ERR_PTR(-EINVAL);

	table = squashfs_read_table(sb, id_table_start, length);
	if (IS_ERR(table))
		return table;

	/*
	 * table[0], table[1], ... table[indexes - 1] store the locations
	 * of the compressed id blocks. Each entry should be less than
	 * the next (i.e. table[0] < table[1]), and the difference between them
	 * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1]
	 * should be less than id_table_start, and again the difference
	 * should be SQUASHFS_METADATA_SIZE or less
	 */
	for (n = 0; n < (indexes - 1); n++) {
		start = le64_to_cpu(table[n]);
		end = le64_to_cpu(table[n + 1]);

		if (start >= end || (end - start) >
				(SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
			kfree(table);
			return ERR_PTR(-EINVAL);
		}
	}

	start = le64_to_cpu(table[indexes - 1]);
	if (start >= id_table_start || (id_table_start - start) >
				(SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
		kfree(table);
		return ERR_PTR(-EINVAL);
	}

	return table;
}
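/*
 * Editor's illustration of the arithmetic presumably behind
 * SQUASHFS_ID_BLOCK()/SQUASHFS_ID_BLOCK_OFFSET(): ids are packed as
 * 32-bit little-endian values into 8 KiB metadata blocks, so an id index
 * splits into an index-table slot and a byte offset within that block.
 * METADATA_SIZE mirrors SQUASHFS_METADATA_SIZE (8192); this is a
 * userspace sketch, not kernel code.
 */
#include <stdint.h>
#include <stdio.h>

#define METADATA_SIZE	8192u
#define IDS_PER_BLOCK	(METADATA_SIZE / sizeof(uint32_t))	/* 2048 ids */

int main(void)
{
	unsigned int index = 3000;	/* example uid/gid index from an inode */
	unsigned int block = index / IDS_PER_BLOCK;
	unsigned int offset = (index % IDS_PER_BLOCK) * sizeof(uint32_t);

	/* 'block' selects the cached msblk->id_table[] entry, which names
	 * the on-disk metadata block; 'offset' locates the id inside it. */
	printf("index %u -> index-table slot %u, offset %u bytes\n",
	       index, block, offset);
	return 0;
}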
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_XSTATE_H
#define __X86_KERNEL_FPU_XSTATE_H

#include <asm/cpufeature.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/xcr.h>

#ifdef CONFIG_X86_64
DECLARE_PER_CPU(u64, xfd_state);
#endif

static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
{
	/*
	 * XRSTORS requires these bits set in xcomp_bv, or it will
	 * trigger #GP:
	 */
	if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED))
		xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT;
}

static inline u64 xstate_get_group_perm(bool guest)
{
	struct fpu *fpu = &current->group_leader->thread.fpu;
	struct fpu_state_perm *perm;

	/* Pairs with WRITE_ONCE() in xstate_request_perm() */
	perm = guest ? &fpu->guest_perm : &fpu->perm;
	return READ_ONCE(perm->__state_perm);
}

static inline u64 xstate_get_host_group_perm(void)
{
	return xstate_get_group_perm(false);
}

enum xstate_copy_mode {
	XSTATE_COPY_FP,
	XSTATE_COPY_FX,
	XSTATE_COPY_XSAVE,
};

struct membuf;
extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
				      u64 xfeatures, u32 pkru_val,
				      enum xstate_copy_mode copy_mode);
extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
				    enum xstate_copy_mode mode);
extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf);

extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system_xstate(unsigned int legacy_size);

extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr);

static inline u64 xfeatures_mask_supervisor(void)
{
	return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
}

static inline u64 xfeatures_mask_independent(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
		return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR;

	return fpu_kernel_cfg.independent_features;
}

/*
 * Update the value of PKRU register that was already pushed onto the signal frame.
*/ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 mask, u32 pkru) { u64 xstate_bv; int err; if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) return 0; /* Mark PKRU as in-use so that it is restored correctly. */ xstate_bv = (mask & xfeatures_in_use()) | XFEATURE_MASK_PKRU; err = __put_user(xstate_bv, &buf->header.xfeatures); if (err) return err; /* Update PKRU value in the userspace xsave buffer. */ return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU)); } /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " #else #define REX_PREFIX #endif /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" #define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" #define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" /* * After this @err contains 0 on success or the trap number when the * operation raises an exception. */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ "2:\n\t" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") /* * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor * states in addition to XSAVEC. * * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports * compacted storage format in addition to XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. * * Use XSAVE as a fallback. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ asm volatile("1: " ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ "\n" \ "xor %[err], %[err]\n" \ "3:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") /* * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact * XSAVE area format. */ #define XSTATE_XRESTORE(st, lmask, hmask) \ asm volatile("1: " ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ "3:\n" \ _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \ : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") #if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); #else static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } #endif #ifdef CONFIG_X86_64 static inline void xfd_set_state(u64 xfd) { wrmsrl(MSR_IA32_XFD, xfd); __this_cpu_write(xfd_state, xfd); } static inline void xfd_update_state(struct fpstate *fpstate) { if (fpu_state_size_dynamic()) { u64 xfd = fpstate->xfd; if (__this_cpu_read(xfd_state) != xfd) xfd_set_state(xfd); } } extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); #else static inline void xfd_set_state(u64 xfd) { } static inline void xfd_update_state(struct fpstate *fpstate) { } static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { return -EPERM; } #endif /* * Save processor xstate to xsave area. * * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features * and command line options. 
The choice is permanent until the next reboot. */ static inline void os_xsave(struct fpstate *fpstate) { u64 mask = fpstate->xfeatures; u32 lmask = mask; u32 hmask = mask >> 32; int err; WARN_ON_FPU(!alternatives_patched); xfd_validate_state(fpstate, mask, false); XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); } /* * Restore processor xstate from xsave area. * * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. */ static inline void os_xrstor(struct fpstate *fpstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; xfd_validate_state(fpstate, mask, true); XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } /* Restore of supervisor state. Does not require XFD */ static inline void os_xrstor_supervisor(struct fpstate *fpstate) { u64 mask = xfeatures_mask_supervisor(); u32 lmask = mask; u32 hmask = mask >> 32; XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } /* * XSAVE itself always writes all requested xfeatures. Removing features * from the request bitmap reduces the features which are written. * Generate a mask of features which must be written to a sigframe. The * unset features can be optimized away and not written. * * This optimization is user-visible. Only use for states where * uninitialized sigframe contents are tolerable, like dynamic features. * * Users of buffers produced with this optimization must check XSTATE_BV * to determine which features have been optimized out. */ static inline u64 xfeatures_need_sigframe_write(void) { u64 xfeaures_to_write; /* In-use features must be written: */ xfeaures_to_write = xfeatures_in_use(); /* Also write all non-optimizable sigframe features: */ xfeaures_to_write |= XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_SIGFRAME_INITOPT; return xfeaures_to_write; } /* * Save xstate to user space xsave area. * * We don't use modified optimization because xrstor/xrstors might track * a different application. * * We don't use compacted format xsave area for backward compatibility for * old applications which don't understand the compacted format of the * xsave area. * * The caller has to zero buf::header before calling this because XSAVE* * does not touch the reserved fields in the header. */ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkru) { /* * Include the features which are not xsaved/rstored by the kernel * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ struct fpstate *fpstate = current->thread.fpu.fpstate; u64 mask = fpstate->user_xfeatures; u32 lmask; u32 hmask; int err; /* Optimize away writing unnecessary xfeatures: */ if (fpu_state_size_dynamic()) mask &= xfeatures_need_sigframe_write(); lmask = mask; hmask = mask >> 32; xfd_validate_state(fpstate, mask, false); stac(); XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); if (!err) err = update_pkru_in_sigframe(buf, mask, pkru); return err; } /* * Restore xstate from user space xsave area. */ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) { struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; u32 hmask = mask >> 32; int err; xfd_validate_state(current->thread.fpu.fpstate, mask, true); stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); clac(); return err; } /* * Restore xstate from kernel space xsave area, return an error code instead of * an exception. 
*/ static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) { struct xregs_state *xstate = &fpstate->regs.xsave; u32 lmask = mask; u32 hmask = mask >> 32; int err; /* Ensure that XFD is up to date */ xfd_update_state(fpstate); if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); return err; } #endif
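/*
 * Editor's illustration, not kernel code: the XSTATE_OP()/XSTATE_XSAVE()
 * macros above pass the 64-bit requested-feature bitmap to the XSAVE* and
 * XRSTOR* instructions in EDX:EAX, which is why every wrapper splits
 * 'mask' into 32-bit lmask/hmask halves. The feature bits are examples
 * (FP=0, SSE=1, PKRU=9).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mask = (1ull << 0) | (1ull << 1) | (1ull << 9);
	uint32_t lmask = (uint32_t)mask;	 /* -> EAX ("a" constraint) */
	uint32_t hmask = (uint32_t)(mask >> 32); /* -> EDX ("d" constraint) */

	printf("mask=%#llx lmask=%#x hmask=%#x\n",
	       (unsigned long long)mask, lmask, hmask);
	return 0;
}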
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/xattr_trusted.c
 * Handler for trusted extended attributes.
 *
 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */

#include <linux/string.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"

static bool
ext4_xattr_trusted_list(struct dentry *dentry)
{
	return capable(CAP_SYS_ADMIN);
}

static int
ext4_xattr_trusted_get(const struct xattr_handler *handler,
		       struct dentry *unused, struct inode *inode,
		       const char *name, void *buffer, size_t size)
{
	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
			      name, buffer, size);
}

static int
ext4_xattr_trusted_set(const struct xattr_handler *handler,
		       struct mnt_idmap *idmap,
		       struct dentry *unused, struct inode *inode,
		       const char *name, const void *value,
		       size_t size, int flags)
{
	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
			      name, value, size, flags);
}

const struct xattr_handler ext4_xattr_trusted_handler = {
	.prefix	= XATTR_TRUSTED_PREFIX,
	.list	= ext4_xattr_trusted_list,
	.get	= ext4_xattr_trusted_get,
	.set	= ext4_xattr_trusted_set,
};
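/*
 * Editor's sketch, not part of ext4: exercising the trusted.* handler
 * above from userspace. trusted attributes require CAP_SYS_ADMIN, which
 * is also why ext4_xattr_trusted_list() hides them from unprivileged
 * listxattr(). The path and attribute name are examples.
 */
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/ext4/file";
	char buf[64];
	ssize_t len;

	if (setxattr(path, "trusted.example", "42", 2, 0) != 0)
		perror("setxattr");	/* EPERM without CAP_SYS_ADMIN */

	len = getxattr(path, "trusted.example", buf, sizeof(buf) - 1);
	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	buf[len] = '\0';
	printf("trusted.example = %s\n", buf);
	return 0;
}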
// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_refcount.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_bit.h"
#include "xfs_alloc.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_health.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_rtgroup.h"
#include "xfs_metafile.h"

/*
 * Copy on Write of Shared Blocks
 *
 * XFS must preserve "the usual" file semantics even when two files share
 * the same physical blocks. This means that a write to one file must not
 * alter the blocks in a different file; the way that we'll do that is
 * through the use of a copy-on-write mechanism. At a high level, that
 * means that when we want to write to a shared block, we allocate a new
 * block, write the data to the new block, and if that succeeds we map the
 * new block into the file.
 *
 * XFS provides a "delayed allocation" mechanism that defers the allocation
 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
 * possible. This reduces fragmentation by enabling the filesystem to ask
 * for bigger chunks less often, which is exactly what we want for CoW.
 *
 * The delalloc mechanism begins when the kernel wants to make a block
 * writable (write_begin or page_mkwrite). If the offset is not mapped, we
 * create a delalloc mapping, which is a regular in-core extent, but without
 * a real startblock. (For delalloc mappings, the startblock encodes both
 * a flag that this is a delalloc mapping, and a worst-case estimate of how
 * many blocks might be required to put the mapping into the BMBT.)
 * delalloc mappings are a reservation against the free space in the
 * filesystem; adjacent mappings can also be combined into fewer larger
 * mappings.
 *
 * As an optimization, the CoW extent size hint (cowextsz) creates
 * outsized aligned delalloc reservations in the hope of landing out of
 * order nearby CoW writes in a single extent on disk, thereby reducing
 * fragmentation and improving future performance.
 *
 * D: --RRRRRRSSSRRRRRRRR--- (data fork)
 * C: ------DDDDDDD--------- (CoW fork)
 *
 * When dirty pages are being written out (typically in writepage), the
 * delalloc reservations are converted into unwritten mappings by
 * allocating blocks and replacing the delalloc mapping with real ones.
 * A delalloc mapping can be replaced by several unwritten ones if the
 * free space is fragmented.
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUUUUUU---------
 *
 * We want to adapt the delalloc mechanism for copy-on-write, since the
 * write paths are similar. The first two steps (creating the reservation
 * and allocating the blocks) are exactly the same as delalloc except that
 * the mappings must be stored in a separate CoW fork because we do not want
 * to disturb the mapping in the data fork until we're sure that the write
 * succeeded. IO completion in this case is the process of removing the old
 * mapping from the data fork and moving the new mapping from the CoW fork to
 * the data fork. This will be discussed shortly.
 *
 * For now, unaligned directio writes will be bounced back to the page cache.
 * Block-aligned directio writes will use the same mechanism as buffered
 * writes.
 *
 * Just prior to submitting the actual disk write requests, we convert
 * the extents representing the range of the file actually being written
 * (as opposed to extra pieces created for the cowextsize hint) to real
 * extents. This will become important in the next step:
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUrrUUU---------
 *
 * CoW remapping must be done after the data block write completes,
 * because we don't want to destroy the old data fork map until we're sure
 * the new block has been written. Since the new mappings are kept in a
 * separate fork, we can simply iterate these mappings to find the ones
 * that cover the file blocks that we just CoW'd. For each extent, simply
 * unmap the corresponding range in the data fork, map the new range into
 * the data fork, and remove the extent from the CoW fork. Because of
 * the presence of the cowextsize hint, however, we must be careful
 * only to remap the blocks that we've actually written out -- we must
 * never remap delalloc reservations nor CoW staging blocks that have
 * yet to be written. This corresponds exactly to the real extents in
 * the CoW fork:
 *
 * D: --RRRRRRrrSRRRRRRRR---
 * C: ------UU--UUU---------
 *
 * Since the remapping operation can be applied to an arbitrary file
 * range, we record the need for the remap step as a flag in the ioend
 * instead of declaring a new IO type. This is required for direct io
 * because we only have ioend for the whole dio, and we have to be able to
 * remember the presence of unwritten blocks and CoW blocks with a single
 * ioend structure. Better yet, the more ground we can cover with one
 * ioend, the better.
 */

/*
 * Given a file mapping for the data device, find the lowest-numbered run of
 * shared blocks within that mapping and return it in shared_offset/shared_len.
 * The offset is relative to the start of irec.
 *
 * If find_end_of_shared is true, return the longest contiguous extent of
 * shared blocks.
If there are no shared extents, shared_offset and shared_len will be * set to 0; */ static int xfs_reflink_find_shared( struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_bmbt_irec *irec, xfs_extlen_t *shared_offset, xfs_extlen_t *shared_len, bool find_end_of_shared) { struct xfs_buf *agbp; struct xfs_perag *pag; struct xfs_btree_cur *cur; int error; xfs_agblock_t orig_bno, found_bno; pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) goto out; cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount, &found_bno, shared_len, find_end_of_shared); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); if (!error && *shared_len) *shared_offset = found_bno - orig_bno; out: xfs_perag_put(pag); return error; } /* * Given a file mapping for the rt device, find the lowest-numbered run of * shared blocks within that mapping and return it in shared_offset/shared_len. * The offset is relative to the start of irec. * * If find_end_of_shared is true, return the longest contiguous extent of shared * blocks. If there are no shared extents, shared_offset and shared_len will be * set to 0; */ static int xfs_reflink_find_rtshared( struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_bmbt_irec *irec, xfs_extlen_t *shared_offset, xfs_extlen_t *shared_len, bool find_end_of_shared) { struct xfs_rtgroup *rtg; struct xfs_btree_cur *cur; xfs_rgblock_t orig_bno; xfs_agblock_t found_bno; int error; BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK); /* * Note: this uses the not quite correct xfs_agblock_t type because * xfs_refcount_find_shared is shared between the RT and data device * refcount code. */ orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock); rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, irec->br_startblock)); xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); cur = xfs_rtrefcountbt_init_cursor(tp, rtg); error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount, &found_bno, shared_len, find_end_of_shared); xfs_btree_del_cursor(cur, error); xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT); xfs_rtgroup_put(rtg); if (!error && *shared_len) *shared_offset = found_bno - orig_bno; return error; } /* * Trim the mapping to the next block where there's a change in the * shared/unshared status. More specifically, this means that we * find the lowest-numbered extent of shared blocks that coincides with * the given block mapping. If the shared extent overlaps the start of * the mapping, trim the mapping to the end of the shared extent. If * the shared region intersects the mapping, trim the mapping to the * start of the shared extent. If there are no shared regions that * overlap, just return the original extent. */ int xfs_reflink_trim_around_shared( struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared) { struct xfs_mount *mp = ip->i_mount; xfs_extlen_t shared_offset, shared_len; int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { *shared = false; return 0; } trace_xfs_reflink_trim_around_shared(ip, irec); if (XFS_IS_REALTIME_INODE(ip)) error = xfs_reflink_find_rtshared(mp, NULL, irec, &shared_offset, &shared_len, true); else error = xfs_reflink_find_shared(mp, NULL, irec, &shared_offset, &shared_len, true); if (error) return error; if (!shared_len) { /* No shared blocks at all. 
*/ *shared = false; } else if (!shared_offset) { /* * The start of this mapping points to shared space. Truncate * the mapping at the end of the shared region so that a * subsequent iteration starts at the start of the unshared * region. */ irec->br_blockcount = shared_len; *shared = true; } else { /* * There's a shared region that doesn't start at the beginning * of the mapping. Truncate the mapping at the start of the * shared extent so that a subsequent iteration starts at the * start of the shared region. */ irec->br_blockcount = shared_offset; *shared = false; } return 0; } int xfs_bmap_trim_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared) { /* We can't update any real extents in always COW mode. */ if (xfs_is_always_cow_inode(ip) && !isnullstartblock(imap->br_startblock)) { *shared = true; return 0; } /* Trim the mapping to the nearest shared extent boundary. */ return xfs_reflink_trim_around_shared(ip, imap, shared); } static int xfs_reflink_convert_cow_locked( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb) { struct xfs_iext_cursor icur; struct xfs_bmbt_irec got; struct xfs_btree_cur *dummy_cur = NULL; int dummy_logflags; int error = 0; if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; do { if (got.br_startoff >= offset_fsb + count_fsb) break; if (got.br_state == XFS_EXT_NORM) continue; if (WARN_ON_ONCE(isnullstartblock(got.br_startblock))) return -EIO; xfs_trim_extent(&got, offset_fsb, count_fsb); if (!got.br_blockcount) continue; got.br_state = XFS_EXT_NORM; error = xfs_bmap_add_extent_unwritten_real(NULL, ip, XFS_COW_FORK, &icur, &dummy_cur, &got, &dummy_logflags); if (error) return error; } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got)); return error; } /* Convert all of the unwritten CoW extents in a file's range to real ones. */ int xfs_reflink_convert_cow( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_filblks_t count_fsb = end_fsb - offset_fsb; int error; ASSERT(count != 0); xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* * Find the extent that maps the given range in the COW fork. Even if the extent * is not shared we might have a preallocation for it in the COW fork. If so we * use it that rather than trigger a new allocation. */ static int xfs_find_trim_cow_extent( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, struct xfs_bmbt_irec *cmap, bool *shared, bool *found) { xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; struct xfs_iext_cursor icur; *found = false; /* * If we don't find an overlapping extent, trim the range we need to * allocate to fit the hole we found. 
*/ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap)) cmap->br_startoff = offset_fsb + count_fsb; if (cmap->br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, cmap->br_startoff - imap->br_startoff); return xfs_bmap_trim_cow(ip, imap, shared); } *shared = true; if (isnullstartblock(cmap->br_startblock)) { xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount); return 0; } /* real extent found - no need to allocate */ xfs_trim_extent(cmap, offset_fsb, count_fsb); *found = true; return 0; } static int xfs_reflink_convert_unwritten( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, struct xfs_bmbt_irec *cmap, bool convert_now) { xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; int error; /* * cmap might larger than imap due to cowextsize hint. */ xfs_trim_extent(cmap, offset_fsb, count_fsb); /* * COW fork extents are supposed to remain unwritten until we're ready * to initiate a disk write. For direct I/O we are going to write the * data and need the conversion, but for buffered writes we're done. */ if (!convert_now || cmap->br_state == XFS_EXT_NORM) return 0; trace_xfs_reflink_convert_cow(ip, cmap); error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); if (!error) cmap->br_state = XFS_EXT_NORM; return error; } static int xfs_reflink_fill_cow_hole( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode, bool convert_now) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; xfs_filblks_t resaligned; unsigned int dblocks = 0, rblocks = 0; int nimaps; int error; bool found; resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); if (XFS_IS_REALTIME_INODE(ip)) { dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resaligned; } else { dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); rblocks = 0; } xfs_iunlock(ip, *lockmode); *lockmode = 0; error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, rblocks, false, &tp); if (error) return error; *lockmode = XFS_ILOCK_EXCL; error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) goto out_trans_cancel; if (found) { xfs_trans_cancel(tp); goto convert; } /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, &nimaps); if (error) goto out_trans_cancel; xfs_inode_set_cowblocks_tag(ip); error = xfs_trans_commit(tp); if (error) return error; convert: return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); out_trans_cancel: xfs_trans_cancel(tp); return error; } static int xfs_reflink_fill_delalloc( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode, bool convert_now) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int nimaps; int error; bool found; do { xfs_iunlock(ip, *lockmode); *lockmode = 0; error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0, false, &tp); if (error) return error; *lockmode = XFS_ILOCK_EXCL; error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) goto out_trans_cancel; if (found) { xfs_trans_cancel(tp); break; } ASSERT(isnullstartblock(cmap->br_startblock) || cmap->br_startblock == DELAYSTARTBLOCK); /* * Replace delalloc reservation with an unwritten extent. 
*/ nimaps = 1; error = xfs_bmapi_write(tp, ip, cmap->br_startoff, cmap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, &nimaps); if (error) goto out_trans_cancel; xfs_inode_set_cowblocks_tag(ip); error = xfs_trans_commit(tp); if (error) return error; } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); out_trans_cancel: xfs_trans_cancel(tp); return error; } /* Allocate all CoW reservations covering a range of blocks in a file. */ int xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode, bool convert_now) { int error; bool found; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (!ip->i_cowfp) { ASSERT(!xfs_is_reflink_inode(ip)); xfs_ifork_init_cow(ip); } error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) return error; /* CoW fork has a real extent */ if (found) return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); /* * CoW fork does not have an extent and data extent is shared. * Allocate a real extent in the CoW fork. */ if (cmap->br_startoff > imap->br_startoff) return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared, lockmode, convert_now); /* * CoW fork has a delalloc reservation. Replace it with a real extent. * There may or may not be a data fork mapping. */ if (isnullstartblock(cmap->br_startblock) || cmap->br_startblock == DELAYSTARTBLOCK) return xfs_reflink_fill_delalloc(ip, imap, cmap, shared, lockmode, convert_now); /* Shouldn't get here. */ ASSERT(0); return -EFSCORRUPTED; } /* * Cancel CoW reservations for some block range of an inode. * * If cancel_real is true this function cancels all COW fork extents for the * inode; if cancel_real is false, real extents are not cleared. * * Caller must have already joined the inode to the current transaction. The * inode will be joined to the transaction returned to the caller. */ int xfs_reflink_cancel_cow_blocks( struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, bool cancel_real) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; bool isrt = XFS_IS_REALTIME_INODE(ip); int error = 0; if (!xfs_inode_has_cow_data(ip)) return 0; if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) return 0; /* Walk backwards until we're out of the I/O range... */ while (got.br_startoff + got.br_blockcount > offset_fsb) { del = got; xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); /* Extent delete may have bumped ext forward */ if (!del.br_blockcount) { xfs_iext_prev(ifp, &icur); goto next_extent; } trace_xfs_reflink_cancel_cow(ip, &del); if (isnullstartblock(del.br_startblock)) { xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got, &del); } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); /* Free the CoW orphan record. */ xfs_refcount_free_cow_extent(*tpp, isrt, del.br_startblock, del.br_blockcount); error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, XFS_AG_RESV_NONE, isrt ? XFS_FREE_EXTENT_REALTIME : 0); if (error) break; /* Roll the transaction */ error = xfs_defer_finish(tpp); if (error) break; /* Remove the mapping from the CoW fork. 
*/ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); /* Remove the quota reservation */ xfs_quota_unreserve_blkres(ip, del.br_blockcount); } else { /* Didn't do anything, push cursor back. */ xfs_iext_prev(ifp, &icur); } next_extent: if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } /* clear tag if cow fork is emptied */ if (!ifp->if_bytes) xfs_inode_clear_cowblocks_tag(ip); return error; } /* * Cancel CoW reservations for some byte range of an inode. * * If cancel_real is true this function cancels all COW fork extents for the * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_range( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, bool cancel_real) { struct xfs_trans *tp; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; int error; trace_xfs_reflink_cancel_cow_range(ip, offset, count); ASSERT(ip->i_cowfp); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (count == NULLFILEOFF) end_fsb = NULLFILEOFF; else end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); /* Start a rolling transaction to remove the mappings */ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, 0, 0, 0, &tp); if (error) goto out; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); /* Scrape out the old CoW reservations */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, cancel_real); if (error) goto out_cancel; error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out: trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); return error; } #ifdef CONFIG_XFS_QUOTA /* * Update quota accounting for a remapping operation. When we're remapping * something from the CoW fork to the data fork, we must update the quota * accounting for delayed allocations. For remapping from the data fork to the * data fork, use regular block accounting. */ static inline void xfs_reflink_update_quota( struct xfs_trans *tp, struct xfs_inode *ip, bool is_cow, int64_t blocks) { unsigned int qflag; if (XFS_IS_REALTIME_INODE(ip)) { qflag = is_cow ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_RTBCOUNT; } else { qflag = is_cow ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT; } xfs_trans_mod_dquot_byino(tp, ip, qflag, blocks); } #else # define xfs_reflink_update_quota(tp, ip, is_cow, blocks) ((void)0) #endif /* * Remap part of the CoW fork into the data fork. * * We aim to remap the range starting at @offset_fsb and ending at @end_fsb * into the data fork; this function will remap what it can (at the end of the * range) and update @end_fsb appropriately. Each remap gets its own * transaction because we can end up merging and splitting bmbt blocks for * every remap operation and we'd like to keep the block reservation * requirements as low as possible. */ STATIC int xfs_reflink_end_cow_extent( struct xfs_inode *ip, xfs_fileoff_t *offset_fsb, xfs_fileoff_t end_fsb) { struct xfs_iext_cursor icur; struct xfs_bmbt_irec got, del, data; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); unsigned int resblks; int nmaps; bool isrt = XFS_IS_REALTIME_INODE(ip); int error; resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; /* * Lock the inode. We have to ijoin without automatic unlock because * the lead transaction is the refcountbt record deletion; the data * fork update follows as a deferred log item. 
*/ xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); /* * In case of racing, overlapping AIO writes no COW extents might be * left by the time I/O completes for the loser of the race. In that * case we are done. */ if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) || got.br_startoff >= end_fsb) { *offset_fsb = end_fsb; goto out_cancel; } /* * Only remap real extents that contain data. With AIO, speculative * preallocations can leak into the range we are called upon, and we * need to skip them. Preserve @got for the eventual CoW fork * deletion; from now on @del represents the mapping that we're * actually remapping. */ while (!xfs_bmap_is_written_extent(&got)) { if (!xfs_iext_next_extent(ifp, &icur, &got) || got.br_startoff >= end_fsb) { *offset_fsb = end_fsb; goto out_cancel; } } del = got; xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb); error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_REFLINK_END_COW_CNT); if (error) goto out_cancel; /* Grab the corresponding mapping in the data fork. */ nmaps = 1; error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, &nmaps, 0); if (error) goto out_cancel; /* We can only remap the smaller of the two extent sizes. */ data.br_blockcount = min(data.br_blockcount, del.br_blockcount); del.br_blockcount = data.br_blockcount; trace_xfs_reflink_cow_remap_from(ip, &del); trace_xfs_reflink_cow_remap_to(ip, &data); if (xfs_bmap_is_real_extent(&data)) { /* * If the extent we're remapping is backed by storage (written * or not), unmap the extent and drop its refcount. */ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); xfs_refcount_decrease_extent(tp, isrt, &data); xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount); } else if (data.br_startblock == DELAYSTARTBLOCK) { int done; /* * If the extent we're remapping is a delalloc reservation, * we can use the regular bunmapi function to release the * incore state. Dropping the delalloc reservation takes care * of the quota reservation for us. */ error = xfs_bunmapi(NULL, ip, data.br_startoff, data.br_blockcount, 0, 1, &done); if (error) goto out_cancel; ASSERT(done); } /* Free the CoW orphan record. */ xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock, del.br_blockcount); /* Map the new blocks into the data fork. */ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del); /* Charge this new data fork mapping to the on-disk quota. */ xfs_reflink_update_quota(tp, ip, true, del.br_blockcount); /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; /* Update the caller about how much progress we made. */ *offset_fsb = del.br_startoff + del.br_blockcount; return 0; out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* * Remap parts of a file's data fork after a successful CoW. */ int xfs_reflink_end_cow( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count) { xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; int error = 0; trace_xfs_reflink_end_cow(ip, offset, count); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); /* * Walk forwards until we've remapped the I/O range. The loop function * repeatedly cycles the ILOCK to allocate one transaction per remapped * extent. * * If we're being called by writeback then the pages will still * have PageWriteback set, which prevents races with reflink remapping * and truncate. 
Reflink remapping prevents races with writeback by * taking the iolock and mmaplock before flushing the pages and * remapping, which means there won't be any further writeback or page * cache dirtying until the reflink completes. * * We should never have two threads issuing writeback for the same file * region. There are also have post-eof checks in the writeback * preparation code so that we don't bother writing out pages that are * about to be truncated. * * If we're being called as part of directio write completion, the dio * count is still elevated, which reflink and truncate will wait for. * Reflink remapping takes the iolock and mmaplock and waits for * pending dio to finish, which should prevent any directio until the * remap completes. Multiple concurrent directio writes to the same * region are handled by end_cow processing only occurring for the * threads which succeed; the outcome of multiple overlapping direct * writes is not well defined anyway. * * It's possible that a buffered write and a direct write could collide * here (the buffered write stumbles in after the dio flushes and * invalidates the page cache and immediately queues writeback), but we * have never supported this 100%. If either disk write succeeds the * blocks will be remapped. */ while (end_fsb > offset_fsb && !error) error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb); if (error) trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); return error; } /* * Free all CoW staging blocks that are still referenced by the ondisk refcount * metadata. The ondisk metadata does not track which inode created the * staging extent, so callers must ensure that there are no cached inodes with * live CoW staging extents. */ int xfs_reflink_recover_cow( struct xfs_mount *mp) { struct xfs_perag *pag = NULL; struct xfs_rtgroup *rtg = NULL; int error = 0; if (!xfs_has_reflink(mp)) return 0; while ((pag = xfs_perag_next(mp, pag))) { error = xfs_refcount_recover_cow_leftovers(pag_group(pag)); if (error) { xfs_perag_rele(pag); return error; } } while ((rtg = xfs_rtgroup_next(mp, rtg))) { error = xfs_refcount_recover_cow_leftovers(rtg_group(rtg)); if (error) { xfs_rtgroup_rele(rtg); return error; } } return 0; } /* * Reflinking (Block) Ranges of Two Files Together * * First, ensure that the reflink flag is set on both inodes. The flag is an * optimization to avoid unnecessary refcount btree lookups in the write path. * * Now we can iteratively remap the range of extents (and holes) in src to the * corresponding ranges in dest. Let drange and srange denote the ranges of * logical blocks in dest and src touched by the reflink operation. * * While the length of drange is greater than zero, * - Read src's bmbt at the start of srange ("imap") * - If imap doesn't exist, make imap appear to start at the end of srange * with zero length. * - If imap starts before srange, advance imap to start at srange. * - If imap goes beyond srange, truncate imap to end at the end of srange. * - Punch (imap start - srange start + imap len) blocks from dest at * offset (drange start). * - If imap points to a real range of pblks, * > Increase the refcount of the imap's pblks * > Map imap's pblks into dest at the offset * (drange start + imap start - srange start) * - Advance drange and srange by (imap start - srange start + imap len) * * Finally, if the reflink made dest longer, update both the in-core and * on-disk file sizes. 
* * ASCII Art Demonstration: * * Let's say we want to reflink this source file: * * ----SSSSSSS-SSSSS----SSSSSS (src file) * <--------------------> * * into this destination file: * * --DDDDDDDDDDDDDDDDDDD--DDD (dest file) * <--------------------> * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest. * Observe that the range has different logical offsets in either file. * * Consider that the first extent in the source file doesn't line up with our * reflink range. Unmapping and remapping are separate operations, so we can * unmap more blocks from the destination file than we remap. * * ----SSSSSSS-SSSSS----SSSSSS * <-------> * --DDDDD---------DDDDD--DDD * <-------> * * Now remap the source extent into the destination file: * * ----SSSSSSS-SSSSS----SSSSSS * <-------> * --DDDDD--SSSSSSSDDDDD--DDD * <-------> * * Do likewise with the second hole and extent in our range. Holes in the * unmap range don't affect our operation. * * ----SSSSSSS-SSSSS----SSSSSS * <----> * --DDDDD--SSSSSSS-SSSSS-DDD * <----> * * Finally, unmap and remap part of the third extent. This will increase the * size of the destination file. * * ----SSSSSSS-SSSSS----SSSSSS * <-----> * --DDDDD--SSSSSSS-SSSSS----SSS * <-----> * * Once we update the destination file's i_size, we're done. */ /* * Ensure the reflink bit is set in both inodes. */ STATIC int xfs_reflink_set_inode_flag( struct xfs_inode *src, struct xfs_inode *dest) { struct xfs_mount *mp = src->i_mount; int error; struct xfs_trans *tp; if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) goto out_error; /* Lock both files against IO */ if (src->i_ino == dest->i_ino) xfs_ilock(src, XFS_ILOCK_EXCL); else xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL); if (!xfs_is_reflink_inode(src)) { trace_xfs_reflink_set_inode_flag(src); xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); src->i_diflags2 |= XFS_DIFLAG2_REFLINK; xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); xfs_ifork_init_cow(src); } else xfs_iunlock(src, XFS_ILOCK_EXCL); if (src->i_ino == dest->i_ino) goto commit_flags; if (!xfs_is_reflink_inode(dest)) { trace_xfs_reflink_set_inode_flag(dest); xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); dest->i_diflags2 |= XFS_DIFLAG2_REFLINK; xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); xfs_ifork_init_cow(dest); } else xfs_iunlock(dest, XFS_ILOCK_EXCL); commit_flags: error = xfs_trans_commit(tp); if (error) goto out_error; return error; out_error: trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); return error; } /* * Update destination inode size & cowextsize hint, if necessary. 
*/ int xfs_reflink_update_dest( struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags) { struct xfs_mount *mp = dest->i_mount; struct xfs_trans *tp; int error; if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) goto out_error; xfs_ilock(dest, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); if (newlen > i_size_read(VFS_I(dest))) { trace_xfs_reflink_update_inode_size(dest, newlen); i_size_write(VFS_I(dest), newlen); dest->i_disk_size = newlen; } if (cowextsize) { dest->i_cowextsize = cowextsize; dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; } xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); error = xfs_trans_commit(tp); if (error) goto out_error; return error; out_error: trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_); return error; } /* * Do we have enough reserve in this AG to handle a reflink? The refcount * btree already reserved all the space it needs, but the rmap btree can grow * infinitely, so we won't allow more reflinks when the AG is down to the * btree reserves. */ static int xfs_reflink_ag_has_free_space( struct xfs_mount *mp, struct xfs_inode *ip, xfs_fsblock_t fsb) { struct xfs_perag *pag; xfs_agnumber_t agno; int error = 0; if (!xfs_has_rmapbt(mp)) return 0; if (XFS_IS_REALTIME_INODE(ip)) { struct xfs_rtgroup *rtg; xfs_rgnumber_t rgno; rgno = xfs_rtb_to_rgno(mp, fsb); rtg = xfs_rtgroup_get(mp, rgno); if (xfs_metafile_resv_critical(rtg_rmap(rtg))) error = -ENOSPC; xfs_rtgroup_put(rtg); return error; } agno = XFS_FSB_TO_AGNO(mp, fsb); pag = xfs_perag_get(mp, agno); if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) || xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) error = -ENOSPC; xfs_perag_put(pag); return error; } /* * Remap the given extent into the file. The dmap blockcount will be set to * the number of blocks that were actually remapped. */ STATIC int xfs_reflink_remap_extent( struct xfs_inode *ip, struct xfs_bmbt_irec *dmap, xfs_off_t new_isize) { struct xfs_bmbt_irec smap; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; xfs_off_t newlen; int64_t qdelta = 0; unsigned int dblocks, rblocks, resblks; bool quota_reserved = true; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); bool isrt = XFS_IS_REALTIME_INODE(ip); int iext_delta = 0; int nimaps; int error; /* * Start a rolling transaction to switch the mappings. * * Adding a written extent to the extent map can cause a bmbt split, * and removing a mapped extent from the extent can cause a bmbt split. * The two operations cannot both cause a split since they operate on * the same index in the bmap btree, so we only need a reservation for * one bmbt split if either thing is happening. However, we haven't * locked the inode yet, so we reserve assuming this is the case. * * The first allocation call tries to reserve enough space to handle * mapping dmap into a sparse part of the file plus the bmbt split. We * haven't locked the inode or read the existing mapping yet, so we do * not know for sure that we need the space. This should succeed most * of the time. * * If the first attempt fails, try again but reserving only enough * space to handle a bmbt split. This is the hard minimum requirement, * and we revisit quota reservations later when we know more about what * we're remapping. 
*/ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); if (XFS_IS_REALTIME_INODE(ip)) { dblocks = resblks; rblocks = dmap->br_blockcount; } else { dblocks = resblks + dmap->br_blockcount; rblocks = 0; } error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, rblocks, false, &tp); if (error == -EDQUOT || error == -ENOSPC) { quota_reserved = false; error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, false, &tp); } if (error) goto out; /* * Read what's currently mapped in the destination file into smap. * If smap isn't a hole, we will have to remove it before we can add * dmap to the destination file. */ nimaps = 1; error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount, &smap, &nimaps, 0); if (error) goto out_cancel; ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff); smap_real = xfs_bmap_is_real_extent(&smap); /* * We can only remap as many blocks as the smaller of the two extent * maps, because we can only remap one extent at a time. */ dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount); ASSERT(dmap->br_blockcount == smap.br_blockcount); trace_xfs_reflink_remap_extent_dest(ip, &smap); /* * Two extents mapped to the same physical block must not have * different states; that's filesystem corruption. Move on to the next * extent if they're both holes or both the same physical extent. */ if (dmap->br_startblock == smap.br_startblock) { if (dmap->br_state != smap.br_state) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; } goto out_cancel; } /* If both extents are unwritten, leave them alone. */ if (dmap->br_state == XFS_EXT_UNWRITTEN && smap.br_state == XFS_EXT_UNWRITTEN) goto out_cancel; /* No reflinking if the AG of the dest mapping is low on space. */ if (dmap_written) { error = xfs_reflink_ag_has_free_space(mp, ip, dmap->br_startblock); if (error) goto out_cancel; } /* * Increase quota reservation if we think the quota block counter for * this file could increase. * * If we are mapping a written extent into the file, we need to have * enough quota block count reservation to handle the blocks in that * extent. We log only the delta to the quota block counts, so if the * extent we're unmapping also has blocks allocated to it, we don't * need a quota reservation for the extent itself. * * Note that if we're replacing a delalloc reservation with a written * extent, we have to take the full quota reservation because removing * the delalloc reservation gives the block count back to the quota * count. This is suboptimal, but the VFS flushed the dest range * before we started. That should have removed all the delalloc * reservations, but we code defensively. * * xfs_trans_alloc_inode above already tried to grab an even larger * quota reservation, and kicked off a blockgc scan if it couldn't. * If we can't get a potentially smaller quota reservation now, we're * done. */ if (!quota_reserved && !smap_real && dmap_written) { if (XFS_IS_REALTIME_INODE(ip)) { dblocks = 0; rblocks = dmap->br_blockcount; } else { dblocks = dmap->br_blockcount; rblocks = 0; } error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, false); if (error) goto out_cancel; } if (smap_real) ++iext_delta; if (dmap_written) ++iext_delta; error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, iext_delta); if (error) goto out_cancel; if (smap_real) { /* * If the extent we're unmapping is backed by storage (written * or not), unmap the extent and drop its refcount. 
*/ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap); xfs_refcount_decrease_extent(tp, isrt, &smap); qdelta -= smap.br_blockcount; } else if (smap.br_startblock == DELAYSTARTBLOCK) { int done; /* * If the extent we're unmapping is a delalloc reservation, * we can use the regular bunmapi function to release the * incore state. Dropping the delalloc reservation takes care * of the quota reservation for us. */ error = xfs_bunmapi(NULL, ip, smap.br_startoff, smap.br_blockcount, 0, 1, &done); if (error) goto out_cancel; ASSERT(done); } /* * If the extent we're sharing is backed by written storage, increase * its refcount and map it into the file. */ if (dmap_written) { xfs_refcount_increase_extent(tp, isrt, dmap); xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap); qdelta += dmap->br_blockcount; } xfs_reflink_update_quota(tp, ip, false, qdelta); /* Update dest isize if needed. */ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); newlen = min_t(xfs_off_t, newlen, new_isize); if (newlen > i_size_read(VFS_I(ip))) { trace_xfs_reflink_update_inode_size(ip, newlen); i_size_write(VFS_I(ip), newlen); ip->i_disk_size = newlen; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } /* Commit everything and unlock. */ error = xfs_trans_commit(tp); goto out_unlock; out_cancel: xfs_trans_cancel(tp); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); out: if (error) trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); return error; } /* Remap a range of one file to the other. */ int xfs_reflink_remap_blocks( struct xfs_inode *src, loff_t pos_in, struct xfs_inode *dest, loff_t pos_out, loff_t remap_len, loff_t *remapped) { struct xfs_bmbt_irec imap; struct xfs_mount *mp = src->i_mount; xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in); xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out); xfs_filblks_t len; xfs_filblks_t remapped_len = 0; xfs_off_t new_isize = pos_out + remap_len; int nimaps; int error = 0; len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), XFS_MAX_FILEOFF); trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); while (len > 0) { unsigned int lock_mode; /* Read extent from the source file */ nimaps = 1; lock_mode = xfs_ilock_data_map_shared(src); error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); xfs_iunlock(src, lock_mode); if (error) break; /* * The caller supposedly flushed all dirty pages in the source * file range, which means that writeback should have allocated * or deleted all delalloc reservations in that range. If we * find one, that's a good sign that something is seriously * wrong here. */ ASSERT(nimaps == 1 && imap.br_startoff == srcoff); if (imap.br_startblock == DELAYSTARTBLOCK) { ASSERT(imap.br_startblock != DELAYSTARTBLOCK); xfs_bmap_mark_sick(src, XFS_DATA_FORK); error = -EFSCORRUPTED; break; } trace_xfs_reflink_remap_extent_src(src, &imap); /* Remap into the destination file at the given offset. 
*/ imap.br_startoff = destoff; error = xfs_reflink_remap_extent(dest, &imap, new_isize); if (error) break; if (fatal_signal_pending(current)) { error = -EINTR; break; } /* Advance drange/srange */ srcoff += imap.br_blockcount; destoff += imap.br_blockcount; len -= imap.br_blockcount; remapped_len += imap.br_blockcount; cond_resched(); } if (error) trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); *remapped = min_t(loff_t, remap_len, XFS_FSB_TO_B(src->i_mount, remapped_len)); return error; } /* * If we're reflinking to a point past the destination file's EOF, we must * zero any speculative post-EOF preallocations that sit between the old EOF * and the destination file offset. */ static int xfs_reflink_zero_posteof( struct xfs_inode *ip, loff_t pos) { loff_t isize = i_size_read(VFS_I(ip)); if (pos <= isize) return 0; trace_xfs_zero_eof(ip, isize, pos - isize); return xfs_zero_range(ip, isize, pos - isize, NULL); } /* * Prepare two files for range cloning. Upon a successful return both inodes * will have the iolock and mmaplock held, the page cache of the out file will * be truncated, and any leases on the out file will have been broken. This * function borrows heavily from xfs_file_aio_write_checks. * * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't * checked that the bytes beyond EOF physically match. Hence we cannot use the * EOF block in the source dedupe range because it's not a complete block match, * hence can introduce a corruption into the file that has it's block replaced. * * In similar fashion, the VFS file cloning also allows partial EOF blocks to be * "block aligned" for the purposes of cloning entire files. However, if the * source file range includes the EOF block and it lands within the existing EOF * of the destination file, then we can expose stale data from beyond the source * file EOF in the destination file. * * XFS doesn't support partial block sharing, so in both cases we have check * these cases ourselves. For dedupe, we can simply round the length to dedupe * down to the previous whole block and ignore the partial EOF block. While this * means we can't dedupe the last block of a file, this is an acceptible * tradeoff for simplicity on implementation. * * For cloning, we want to share the partial EOF block if it is also the new EOF * block of the destination file. If the partial EOF block lies inside the * existing destination EOF, then we have to abort the clone to avoid exposing * stale data in the destination file. Hence we reject these clone attempts with * -EINVAL in this case. */ int xfs_reflink_remap_prep( struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); int ret; /* Lock both files against IO */ ret = xfs_ilock2_io_mmap(src, dest); if (ret) return ret; /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; /* Can't reflink between data and rt volumes */ if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest)) goto out_unlock; /* Don't share DAX file data with non-DAX file. 
*/ if (IS_DAX(inode_in) != IS_DAX(inode_out)) goto out_unlock; if (!IS_DAX(inode_in)) ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags); else ret = dax_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags, &xfs_read_iomap_ops); if (ret || *len == 0) goto out_unlock; /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret) goto out_unlock; /* * Zero existing post-eof speculative preallocations in the destination * file. */ ret = xfs_reflink_zero_posteof(dest, pos_out); if (ret) goto out_unlock; /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) goto out_unlock; /* * If pos_out > EOF, we may have dirtied blocks between EOF and * pos_out. In that case, we need to extend the flush and unmap to cover * from EOF to the end of the copy length. */ if (pos_out > XFS_ISIZE(dest)) { loff_t flen = *len + (pos_out - XFS_ISIZE(dest)); ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen); } else { ret = xfs_flush_unmap_range(dest, pos_out, *len); } if (ret) goto out_unlock; xfs_iflags_set(src, XFS_IREMAPPING); if (inode_in != inode_out) xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); return 0; out_unlock: xfs_iunlock2_io_mmap(src, dest); return ret; } /* Does this inode need the reflink flag? */ int xfs_reflink_inode_has_shared_extents( struct xfs_trans *tp, struct xfs_inode *ip, bool *has_shared) { struct xfs_bmbt_irec got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; struct xfs_iext_cursor icur; bool found; int error; ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) return error; *has_shared = false; found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); while (found) { xfs_extlen_t shared_offset, shared_len; if (isnullstartblock(got.br_startblock) || got.br_state != XFS_EXT_NORM) goto next; if (XFS_IS_REALTIME_INODE(ip)) error = xfs_reflink_find_rtshared(mp, tp, &got, &shared_offset, &shared_len, false); else error = xfs_reflink_find_shared(mp, tp, &got, &shared_offset, &shared_len, false); if (error) return error; /* Is there still a shared block here? */ if (shared_len) { *has_shared = true; return 0; } next: found = xfs_iext_next_extent(ifp, &icur, &got); } return 0; } /* * Clear the inode reflink flag if there are no shared extents. * * The caller is responsible for joining the inode to the transaction passed in. * The inode will be joined to the transaction that is returned to the caller. */ int xfs_reflink_clear_inode_flag( struct xfs_inode *ip, struct xfs_trans **tpp) { bool needs_flag; int error = 0; ASSERT(xfs_is_reflink_inode(ip)); if (!xfs_can_free_cowblocks(ip)) return 0; error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag); if (error || needs_flag) return error; /* * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, true); if (error) return error; /* Clear the inode flag. */ trace_xfs_reflink_unset_inode_flag(ip); ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); return error; } /* * Clear the inode reflink flag if there are no shared extents and the size * hasn't changed. 
*/ STATIC int xfs_reflink_try_clear_inode_flag( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = 0; /* Start a rolling transaction to remove the mappings */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); if (error) return error; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); error = xfs_reflink_clear_inode_flag(ip, &tp); if (error) goto cancel; error = xfs_trans_commit(tp); if (error) goto out; xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; cancel: xfs_trans_cancel(tp); out: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* * Pre-COW all shared blocks within a given byte range of a file and turn off * the reflink flag if we unshare all of the file's blocks. */ int xfs_reflink_unshare( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len) { struct inode *inode = VFS_I(ip); int error; if (!xfs_is_reflink_inode(ip)) return 0; trace_xfs_reflink_unshare(ip, offset, len); inode_dio_wait(inode); if (IS_DAX(inode)) error = dax_file_unshare(inode, offset, len, &xfs_dax_write_iomap_ops); else error = iomap_file_unshare(inode, offset, len, &xfs_buffered_write_iomap_ops); if (error) goto out; error = filemap_write_and_wait_range(inode->i_mapping, offset, offset + len - 1); if (error) goto out; /* Turn off the reflink flag if possible. */ error = xfs_reflink_try_clear_inode_flag(ip); if (error) goto out; return 0; out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } /* * Can we use reflink with this realtime extent size? Note that we don't check * for rblocks > 0 here because this can be called as part of attaching a new * rt section. */ bool xfs_reflink_supports_rextsize( struct xfs_mount *mp, unsigned int rextsize) { /* reflink on the realtime device requires rtgroups */ if (!xfs_has_rtgroups(mp)) return false; /* * Reflink doesn't support rt extent size larger than a single fsblock * because we would have to perform CoW-around for unaligned write * requests to guarantee that we always remap entire rt extents. */ if (rextsize != 1) return false; return true; }
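
/*
 * Illustrative aside (not part of either kernel file in this document): the
 * remap path implemented above is what a block-aligned FICLONERANGE request
 * from userspace ultimately exercises on XFS. A minimal userspace sketch:
 */
#if 0	/* userspace sketch only */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct file_clone_range fcr = { .src_length = 0 };	/* 0 = to EOF */
	int src_fd, dst_fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dest>\n", argv[0]);
		return 1;
	}
	src_fd = open(argv[1], O_RDONLY);
	dst_fd = open(argv[2], O_WRONLY | O_CREAT, 0644);
	if (src_fd < 0 || dst_fd < 0) {
		perror("open");
		return 1;
	}
	fcr.src_fd = src_fd;
	/* Reaches xfs_reflink_remap_prep() and xfs_reflink_remap_blocks(). */
	if (ioctl(dst_fd, FICLONERANGE, &fcr)) {
		perror("FICLONERANGE");
		return 1;
	}
	return 0;
}
#endif
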
// SPDX-License-Identifier: GPL-2.0
/*
 * Key setup facility for FS encryption support.
 *
 * Copyright (C) 2015, Google, Inc.
 *
 * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar.
 * Heavily modified since then.
 */

#include <crypto/skcipher.h>
#include <linux/random.h>

#include "fscrypt_private.h"

struct fscrypt_mode fscrypt_modes[] = {
	[FSCRYPT_MODE_AES_256_XTS] = {
		.friendly_name = "AES-256-XTS",
		.cipher_str = "xts(aes)",
		.keysize = 64,
		.security_strength = 32,
		.ivsize = 16,
		.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
	},
	[FSCRYPT_MODE_AES_256_CTS] = {
		.friendly_name = "AES-256-CBC-CTS",
		.cipher_str = "cts(cbc(aes))",
		.keysize = 32,
		.security_strength = 32,
		.ivsize = 16,
	},
	[FSCRYPT_MODE_AES_128_CBC] = {
		.friendly_name = "AES-128-CBC-ESSIV",
		.cipher_str = "essiv(cbc(aes),sha256)",
		.keysize = 16,
		.security_strength = 16,
		.ivsize = 16,
		.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
	},
	[FSCRYPT_MODE_AES_128_CTS] = {
		.friendly_name = "AES-128-CBC-CTS",
		.cipher_str = "cts(cbc(aes))",
		.keysize = 16,
		.security_strength = 16,
		.ivsize = 16,
	},
	[FSCRYPT_MODE_SM4_XTS] = {
		.friendly_name = "SM4-XTS",
		.cipher_str = "xts(sm4)",
		.keysize = 32,
		.security_strength = 16,
		.ivsize = 16,
		.blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS,
	},
	[FSCRYPT_MODE_SM4_CTS] = {
		.friendly_name = "SM4-CBC-CTS",
		.cipher_str = "cts(cbc(sm4))",
		.keysize = 16,
		.security_strength = 16,
		.ivsize = 16,
	},
	[FSCRYPT_MODE_ADIANTUM] = {
		.friendly_name = "Adiantum",
		.cipher_str = "adiantum(xchacha12,aes)",
		.keysize = 32,
		.security_strength = 32,
		.ivsize = 32,
		.blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
	},
	[FSCRYPT_MODE_AES_256_HCTR2] = {
		.friendly_name = "AES-256-HCTR2",
		.cipher_str = "hctr2(aes)",
		.keysize = 32,
		.security_strength = 32,
		.ivsize = 32,
	},
};

static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex);

static struct fscrypt_mode *
select_encryption_mode(const union fscrypt_policy *policy,
		       const struct inode *inode)
{
	BUILD_BUG_ON(ARRAY_SIZE(fscrypt_modes) != FSCRYPT_MODE_MAX + 1);

	if (S_ISREG(inode->i_mode))
		return &fscrypt_modes[fscrypt_policy_contents_mode(policy)];

	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
		return &fscrypt_modes[fscrypt_policy_fnames_mode(policy)];

	WARN_ONCE(1,
		  "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
		  inode->i_ino, (inode->i_mode & S_IFMT));
	return ERR_PTR(-EINVAL);
}

/* Create a symmetric cipher object for the given encryption mode and key */
static struct crypto_skcipher *
fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
			  const struct inode *inode)
{
	struct crypto_skcipher *tfm;
	int err;

	tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
	if (IS_ERR(tfm)) {
		if (PTR_ERR(tfm) == -ENOENT) {
			fscrypt_warn(inode,
				     "Missing crypto API support for %s (API name: \"%s\")",
				     mode->friendly_name, mode->cipher_str);
			return ERR_PTR(-ENOPKG);
		}
		fscrypt_err(inode, "Error allocating '%s' transform: %ld",
			    mode->cipher_str, PTR_ERR(tfm));
		return tfm;
	}
	if (!xchg(&mode->logged_cryptoapi_impl, 1)) {
		/*
		 * fscrypt performance can vary greatly depending on which
		 * crypto algorithm implementation is used. Help people debug
		 * performance problems by logging the ->cra_driver_name the
		 * first time a mode is used.
*/ pr_info("fscrypt: %s using implementation \"%s\"\n", mode->friendly_name, crypto_skcipher_driver_name(tfm)); } if (WARN_ON_ONCE(crypto_skcipher_ivsize(tfm) != mode->ivsize)) { err = -EINVAL; goto err_free_tfm; } crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); if (err) goto err_free_tfm; return tfm; err_free_tfm: crypto_free_skcipher(tfm); return ERR_PTR(err); } /* * Prepare the crypto transform object or blk-crypto key in @prep_key, given the * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci) { struct crypto_skcipher *tfm; if (fscrypt_using_inline_encryption(ci)) return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci); tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) return PTR_ERR(tfm); /* * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). * I.e., here we publish ->tfm with a RELEASE barrier so that * concurrent tasks can ACQUIRE it. Note that this concurrency is only * possible for per-mode keys, not for per-file keys. */ smp_store_release(&prep_key->tfm, tfm); return 0; } /* Destroy a crypto transform object and/or blk-crypto key. */ void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { crypto_free_skcipher(prep_key->tfm); fscrypt_destroy_inline_crypt_key(sb, prep_key); memzero_explicit(prep_key, sizeof(*prep_key)); } /* Given a per-file encryption key, set up the file's crypto transform object */ int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key) { ci->ci_owns_key = true; return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci); } static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, struct fscrypt_prepared_key *keys, u8 hkdf_context, bool include_fs_uuid) { const struct inode *inode = ci->ci_inode; const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; struct fscrypt_prepared_key *prep_key; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; int err; if (WARN_ON_ONCE(mode_num > FSCRYPT_MODE_MAX)) return -EINVAL; prep_key = &keys[mode_num]; if (fscrypt_is_key_prepared(prep_key, ci)) { ci->ci_enc_key = *prep_key; return 0; } mutex_lock(&fscrypt_mode_key_setup_mutex); if (fscrypt_is_key_prepared(prep_key, ci)) goto done_unlock; BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); BUILD_BUG_ON(sizeof(hkdf_info) != 17); hkdf_info[hkdf_infolen++] = mode_num; if (include_fs_uuid) { memcpy(&hkdf_info[hkdf_infolen], &sb->s_uuid, sizeof(sb->s_uuid)); hkdf_infolen += sizeof(sb->s_uuid); } err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); if (err) goto out_unlock; err = fscrypt_prepare_key(prep_key, mode_key, ci); memzero_explicit(mode_key, mode->keysize); if (err) goto out_unlock; done_unlock: ci->ci_enc_key = *prep_key; err = 0; out_unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); return err; } /* * Derive a SipHash key from the given fscrypt master key and the given * application-specific information string. 
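 *
 * (Illustrative aside, not in the original file: such a derivation is just an
 * HKDF-expand with a distinct context byte, e.g.
 *
 *	fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY,
 *			    nonce, FSCRYPT_FILE_NONCE_SIZE,
 *			    (u8 *)&key, sizeof(key));
 *
 * as done by fscrypt_derive_dirhash_key() below.)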
* * Note that the KDF produces a byte array, but the SipHash APIs expect the key * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an * endianness swap in order to get the same results as on little endian CPUs. */ static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, u8 context, const u8 *info, unsigned int infolen, siphash_key_t *key) { int err; err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, (u8 *)key, sizeof(*key)); if (err) return err; BUILD_BUG_ON(sizeof(*key) != 16); BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); le64_to_cpus(&key->key[0]); le64_to_cpus(&key->key[1]); return 0; } int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { int err; err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, &ci->ci_dirhash_key); if (err) return err; ci->ci_dirhash_key_initialized = true; return 0; } void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { WARN_ON_ONCE(ci->ci_inode->i_ino == 0); WARN_ON_ONCE(!mk->mk_ino_hash_key_initialized); ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, &mk->mk_ino_hash_key); } static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk) { int err; err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys, HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true); if (err) return err; /* pairs with smp_store_release() below */ if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) { mutex_lock(&fscrypt_mode_key_setup_mutex); if (mk->mk_ino_hash_key_initialized) goto unlock; err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, &mk->mk_ino_hash_key); if (err) goto unlock; /* pairs with smp_load_acquire() above */ smp_store_release(&mk->mk_ino_hash_key_initialized, true); unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); if (err) return err; } /* * New inodes may not have an inode number assigned yet. * Hashing their inode number is delayed until later. */ if (ci->ci_inode->i_ino) fscrypt_hash_inode_number(ci, mk); return 0; } static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, bool need_dirhash_key) { int err; if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { /* * DIRECT_KEY: instead of deriving per-file encryption keys, the * per-file nonce will be included in all the IVs. But unlike * v1 policies, for v2 policies in this case we don't encrypt * with the master key directly but rather derive a per-mode * encryption key. This ensures that the master key is * consistently used only for HKDF, avoiding key reuse issues. */ err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys, HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { /* * IV_INO_LBLK_64: encryption keys are derived from (master_key, * mode_num, filesystem_uuid), and inode number is included in * the IVs. This format is optimized for use with inline * encryption hardware compliant with the UFS standard. 
*/ err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys, HKDF_CONTEXT_IV_INO_LBLK_64_KEY, true); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, derived_key, ci->ci_mode->keysize); if (err) return err; err = fscrypt_set_per_file_enc_key(ci, derived_key); memzero_explicit(derived_key, ci->ci_mode->keysize); } if (err) return err; /* Derive a secret dirhash key for directories that need it. */ if (need_dirhash_key) { err = fscrypt_derive_dirhash_key(ci, mk); if (err) return err; } return 0; } /* * Check whether the size of the given master key (@mk) is appropriate for the * encryption settings which a particular file will use (@ci). * * If the file uses a v1 encryption policy, then the master key must be at least * as long as the derived key, as this is a requirement of the v1 KDF. * * Otherwise, the KDF can accept any size key, so we enforce a slightly looser * requirement: we require that the size of the master key be at least the * maximum security strength of any algorithm whose key will be derived from it * (but in practice we only need to consider @ci->ci_mode, since any other * possible subkeys such as DIRHASH and INODE_HASH will never increase the * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be * derived from a 256-bit master key, which is cryptographically sufficient, * rather than requiring a 512-bit master key which is unnecessarily long. (We * still allow 512-bit master keys if the user chooses to use them, though.) */ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, const struct fscrypt_inode_info *ci) { unsigned int min_keysize; if (ci->ci_policy.version == FSCRYPT_POLICY_V1) min_keysize = ci->ci_mode->keysize; else min_keysize = ci->ci_mode->security_strength; if (mk->mk_secret.size < min_keysize) { fscrypt_warn(NULL, "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, mk->mk_secret.size, min_keysize); return false; } return true; } /* * Find the master key, then set up the inode's actual encryption key. * * If the master key is found in the filesystem-level keyring, then it is * returned in *mk_ret with its semaphore read-locked. This is needed to ensure * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes * (as multiple tasks may race to create an fscrypt_inode_info for the same * inode), and to synchronize the master key being removed with a new inode * starting to use it. */ static int setup_file_encryption_key(struct fscrypt_inode_info *ci, bool need_dirhash_key, struct fscrypt_master_key **mk_ret) { struct super_block *sb = ci->ci_inode->i_sb; struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; int err; err = fscrypt_select_encryption_impl(ci); if (err) return err; err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec); if (err) return err; mk = fscrypt_find_master_key(sb, &mk_spec); if (unlikely(!mk)) { const union fscrypt_policy *dummy_policy = fscrypt_get_dummy_policy(sb); /* * Add the test_dummy_encryption key on-demand. In principle, * it should be added at mount time. Do it here instead so that * the individual filesystems don't need to worry about adding * this key at mount time and cleaning up on mount failure. 
*/ if (dummy_policy && fscrypt_policies_equal(dummy_policy, &ci->ci_policy)) { err = fscrypt_add_test_dummy_key(sb, &mk_spec); if (err) return err; mk = fscrypt_find_master_key(sb, &mk_spec); } } if (unlikely(!mk)) { if (ci->ci_policy.version != FSCRYPT_POLICY_V1) return -ENOKEY; /* * As a legacy fallback for v1 policies, search for the key in * the current task's subscribed keyrings too. Don't move this * to before the search of ->s_master_keys, since users * shouldn't be able to override filesystem-level keys. */ return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci); } down_read(&mk->mk_sem); if (!mk->mk_present) { /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */ err = -ENOKEY; goto out_release_key; } if (!fscrypt_valid_master_key_size(mk, ci)) { err = -ENOKEY; goto out_release_key; } switch (ci->ci_policy.version) { case FSCRYPT_POLICY_V1: err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw); break; case FSCRYPT_POLICY_V2: err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key); break; default: WARN_ON_ONCE(1); err = -EINVAL; break; } if (err) goto out_release_key; *mk_ret = mk; return 0; out_release_key: up_read(&mk->mk_sem); fscrypt_put_master_key(mk); return err; } static void put_crypt_info(struct fscrypt_inode_info *ci) { struct fscrypt_master_key *mk; if (!ci) return; if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); else if (ci->ci_owns_key) fscrypt_destroy_prepared_key(ci->ci_inode->i_sb, &ci->ci_enc_key); mk = ci->ci_master_key; if (mk) { /* * Remove this inode from the list of inodes that were unlocked * with the master key. In addition, if we're removing the last * inode from an incompletely removed key, then complete the * full removal of the key. */ spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_inode_info_cachep, ci); } static int fscrypt_setup_encryption_info(struct inode *inode, const union fscrypt_policy *policy, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], bool need_dirhash_key) { struct fscrypt_inode_info *crypt_info; struct fscrypt_mode *mode; struct fscrypt_master_key *mk = NULL; int res; res = fscrypt_initialize(inode->i_sb); if (res) return res; crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; crypt_info->ci_inode = inode; crypt_info->ci_policy = *policy; memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); mode = select_encryption_mode(&crypt_info->ci_policy, inode); if (IS_ERR(mode)) { res = PTR_ERR(mode); goto out; } WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; crypt_info->ci_data_unit_bits = fscrypt_policy_du_bits(&crypt_info->ci_policy, inode); crypt_info->ci_data_units_per_block_bits = inode->i_blkbits - crypt_info->ci_data_unit_bits; res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); if (res) goto out; /* * For existing inodes, multiple tasks may race to set ->i_crypt_info. * So use cmpxchg_release(). This pairs with the smp_load_acquire() in * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with * a RELEASE barrier so that other tasks can ACQUIRE it. */ if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { /* * We won the race and set ->i_crypt_info to our crypt_info. * Now link it into the master key's inode list. 
*/ if (mk) { crypt_info->ci_master_key = mk; refcount_inc(&mk->mk_active_refs); spin_lock(&mk->mk_decrypted_inodes_lock); list_add(&crypt_info->ci_master_key_link, &mk->mk_decrypted_inodes); spin_unlock(&mk->mk_decrypted_inodes_lock); } crypt_info = NULL; } res = 0; out: if (mk) { up_read(&mk->mk_sem); fscrypt_put_master_key(mk); } put_crypt_info(crypt_info); return res; } /** * fscrypt_get_encryption_info() - set up an inode's encryption key * @inode: the inode to set up the key for. Must be encrypted. * @allow_unsupported: if %true, treat an unsupported encryption policy (or * unrecognized encryption context) the same way as the key * being unavailable, instead of returning an error. Use * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * * Set up ->i_crypt_info, if it hasn't already been done. * * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * * Return: 0 if ->i_crypt_info was set or was already set, *or* if the * encryption key is unavailable. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) { int res; union fscrypt_context ctx; union fscrypt_policy policy; if (fscrypt_has_encryption_key(inode)) return 0; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (res == -ERANGE && allow_unsupported) return 0; fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } res = fscrypt_policy_from_context(&policy, &ctx, res); if (res) { if (allow_unsupported) return 0; fscrypt_warn(inode, "Unrecognized or corrupt encryption context"); return res; } if (!fscrypt_supported_policy(&policy, inode)) { if (allow_unsupported) return 0; return -EINVAL; } res = fscrypt_setup_encryption_info(inode, &policy, fscrypt_context_nonce(&ctx), IS_CASEFOLDED(inode) && S_ISDIR(inode->i_mode)); if (res == -ENOPKG && allow_unsupported) /* Algorithm unavailable? */ res = 0; if (res == -ENOKEY) res = 0; return res; } /** * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory * @dir: a possibly-encrypted directory * @inode: the new inode. ->i_mode and ->i_blkbits must be set already. * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * * If the directory is encrypted, set up its ->i_crypt_info in preparation for * encrypting the name of the new file. Also, if the new inode will be * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino * isn't required to be set yet, as the filesystem may not have set it yet. * * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). 
* * Return: 0 on success, -ENOKEY if the encryption key is missing, or another * -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { const union fscrypt_policy *policy; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; policy = fscrypt_policy_to_inherit(dir); if (policy == NULL) return 0; if (IS_ERR(policy)) return PTR_ERR(policy); if (WARN_ON_ONCE(inode->i_blkbits == 0)) return -EINVAL; if (WARN_ON_ONCE(inode->i_mode == 0)) return -EINVAL; /* * Only regular files, directories, and symlinks are encrypted. * Special files like device nodes and named pipes aren't. */ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) return 0; *encrypt_ret = true; get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); return fscrypt_setup_encryption_info(inode, policy, nonce, IS_CASEFOLDED(dir) && S_ISDIR(inode->i_mode)); } EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); /** * fscrypt_put_encryption_info() - free most of an inode's fscrypt data * @inode: an inode being evicted * * Free the inode's fscrypt_inode_info. Filesystems must call this when the * inode is being evicted. An RCU grace period need not have elapsed yet. */ void fscrypt_put_encryption_info(struct inode *inode) { put_crypt_info(inode->i_crypt_info); inode->i_crypt_info = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); /** * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay * @inode: an inode being freed * * Free the inode's cached decrypted symlink target, if any. Filesystems must * call this after an RCU grace period, just before they free the inode. */ void fscrypt_free_inode(struct inode *inode) { if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { kfree(inode->i_link); inode->i_link = NULL; } } EXPORT_SYMBOL(fscrypt_free_inode); /** * fscrypt_drop_inode() - check whether the inode's master key has been removed * @inode: an inode being considered for eviction * * Filesystems supporting fscrypt must call this from their ->drop_inode() * method so that encrypted inodes are evicted as soon as they're no longer in * use and their master key has been removed. * * Return: 1 if fscrypt wants the inode to be evicted now, otherwise 0 */ int fscrypt_drop_inode(struct inode *inode) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode); /* * If ci is NULL, then the inode doesn't have an encryption key set up * so it's irrelevant. If ci_master_key is NULL, then the master key * was provided via the legacy mechanism of the process-subscribed * keyrings, so we don't know whether it's been removed or not. */ if (!ci || !ci->ci_master_key) return 0; /* * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes * protected by the key were cleaned by sync_filesystem(). But if * userspace is still using the files, inodes can be dirtied between * then and now. We mustn't lose any writes, so skip dirty inodes here. */ if (inode->i_state & I_DIRTY_ALL) return 0; /* * We can't take ->mk_sem here, since this runs in atomic context. * Therefore, ->mk_present can change concurrently, and our result may * immediately become outdated. But there's no correctness problem with * unnecessarily evicting. Nor is there a correctness problem with not * evicting while iput() is racing with the key being removed, since * then the thread removing the key will either evict the inode itself * or will correctly detect that it wasn't evicted due to the race. */ return !READ_ONCE(ci->ci_master_key->mk_present); } EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
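As a usage note: filesystems typically wire fscrypt_drop_inode() into their ->drop_inode() super operation, consulting it only when generic_drop_inode() would keep the inode cached. A minimal sketch, assuming a hypothetical filesystem "myfs" (the name and the stub super_operations are illustrative, not part of this file):

#include <linux/fs.h>
#include <linux/fscrypt.h>

static int myfs_drop_inode(struct inode *inode)
{
	int drop = generic_drop_inode(inode);

	/*
	 * Even if the generic check would keep the inode cached, evict it
	 * when fscrypt reports that its master key has been removed (see
	 * fscrypt_drop_inode() above).
	 */
	if (!drop)
		drop = fscrypt_drop_inode(inode);
	return drop;
}

static const struct super_operations myfs_sops = {
	.drop_inode	= myfs_drop_inode,
	/* other operations omitted */
};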
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991-1998  Linus Torvalds
 * Re-organised Feb 1998 Russell King
 * Copyright (C) 2020 Christoph Hellwig
 */
#include <linux/fs.h>
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/vmalloc.h>
#include <linux/raid/detect.h>
#include "check.h"

static int (*const check_part[])(struct parsed_partitions *) = {
	/*
	 * Probe partition formats with tables at disk address 0
	 * that also have an ADFS boot block at 0xdc0.
*/ #ifdef CONFIG_ACORN_PARTITION_ICS adfspart_check_ICS, #endif #ifdef CONFIG_ACORN_PARTITION_POWERTEC adfspart_check_POWERTEC, #endif #ifdef CONFIG_ACORN_PARTITION_EESOX adfspart_check_EESOX, #endif /* * Now move on to formats that only have partition info at * disk address 0xdc0. Since these may also have stale * PC/BIOS partition tables, they need to come before * the msdos entry. */ #ifdef CONFIG_ACORN_PARTITION_CUMANA adfspart_check_CUMANA, #endif #ifdef CONFIG_ACORN_PARTITION_ADFS adfspart_check_ADFS, #endif #ifdef CONFIG_CMDLINE_PARTITION cmdline_partition, #endif #ifdef CONFIG_OF_PARTITION of_partition, /* cmdline have priority to OF */ #endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif #ifdef CONFIG_SGI_PARTITION sgi_partition, #endif #ifdef CONFIG_LDM_PARTITION ldm_partition, /* this must come before msdos */ #endif #ifdef CONFIG_MSDOS_PARTITION msdos_partition, #endif #ifdef CONFIG_OSF_PARTITION osf_partition, #endif #ifdef CONFIG_SUN_PARTITION sun_partition, #endif #ifdef CONFIG_AMIGA_PARTITION amiga_partition, #endif #ifdef CONFIG_ATARI_PARTITION atari_partition, #endif #ifdef CONFIG_MAC_PARTITION mac_partition, #endif #ifdef CONFIG_ULTRIX_PARTITION ultrix_partition, #endif #ifdef CONFIG_IBM_PARTITION ibm_partition, #endif #ifdef CONFIG_KARMA_PARTITION karma_partition, #endif #ifdef CONFIG_SYSV68_PARTITION sysv68_partition, #endif NULL }; static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; int nr = DISK_MAX_PARTS; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); if (!state->parts) { kfree(state); return NULL; } state->limit = nr; return state; } static void free_partitions(struct parsed_partitions *state) { vfree(state->parts); kfree(state); } static struct parsed_partitions *check_partition(struct gendisk *hd) { struct parsed_partitions *state; int i, res, err; state = allocate_partitions(hd); if (!state) return NULL; state->pp_buf = (char *)__get_free_page(GFP_KERNEL); if (!state->pp_buf) { free_partitions(state); return NULL; } state->pp_buf[0] = '\0'; state->disk = hd; snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); i = res = err = 0; while (!res && check_part[i]) { memset(state->parts, 0, state->limit * sizeof(state->parts[0])); res = check_part[i++](state); if (res < 0) { /* * We have hit an I/O error which we don't report now. * But record it, and let the others do their job. */ err = res; res = 0; } } if (res > 0) { printk(KERN_INFO "%s", state->pp_buf); free_page((unsigned long)state->pp_buf); return state; } if (state->access_beyond_eod) err = -ENOSPC; /* * The partition is unrecognized. 
So report I/O errors if there were any */ if (err) res = err; if (res) { strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); printk(KERN_INFO "%s", state->pp_buf); } free_page((unsigned long)state->pp_buf); free_partitions(state); return ERR_PTR(res); } static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev))); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); static DEVICE_ATTR(start, 0444, part_start_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); static DEVICE_ATTR(ro, 0444, part_ro_show, NULL); static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_ro.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif NULL }; static const struct attribute_group part_attr_group = { .attrs = part_attrs, }; static const struct attribute_group *part_attr_groups[] = { &part_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, #endif NULL }; static void part_release(struct device *dev) { put_disk(dev_to_bdev(dev)->bd_disk); bdev_drop(dev_to_bdev(dev)); } static int part_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct block_device *part = dev_to_bdev(dev); add_uevent_var(env, "PARTN=%u", bdev_partno(part)); if (part->bd_meta_info && part->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); if (part->bd_meta_info && part->bd_meta_info->uuid[0]) add_uevent_var(env, "PARTUUID=%s", part->bd_meta_info->uuid); return 0; } const struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, .uevent = part_uevent, }; void drop_partition(struct block_device *part) { lockdep_assert_held(&part->bd_disk->open_mutex); xa_erase(&part->bd_disk->part_tbl, bdev_partno(part)); kobject_put(part->bd_holder_dir); device_del(&part->bd_device); put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, struct device_attribute *attr, char *buf) { return 0; } static const DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* * Must be called either with open_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; struct block_device *bdev; const char *dname; int err; lockdep_assert_held(&disk->open_mutex); if (partno >= DISK_MAX_PARTS) return ERR_PTR(-EINVAL); /* * Partitions are not supported on zoned block devices that are used as * such. */ if (bdev_is_zoned(disk->part0)) { pr_warn("%s: partitions not supported on host managed zoned block device\n", disk->disk_name); return ERR_PTR(-ENXIO); } if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); /* ensure we always have a reference to the whole disk */ get_device(disk_to_dev(disk)); err = -ENOMEM; bdev = bdev_alloc(disk, partno); if (!bdev) goto out_put_disk; bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); else dev_set_name(pdev, "%s%d", dname, partno); device_initialize(pdev); pdev->class = &block_class; pdev->type = &part_type; pdev->parent = ddev; /* in consecutive minor range? */ if (bdev_partno(bdev) < disk->minors) { devt = MKDEV(disk->major, disk->first_minor + bdev_partno(bdev)); } else { err = blk_alloc_ext_minor(); if (err < 0) goto out_put; devt = MKDEV(BLOCK_EXT_MAJOR, err); } pdev->devt = devt; if (info) { err = -ENOMEM; bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); if (!bdev->bd_meta_info) goto out_put; } /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); if (err) goto out_put; err = -ENOMEM; bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); if (flags & ADDPART_FLAG_WHOLEDISK) { err = device_create_file(pdev, &dev_attr_whole_disk); if (err) goto out_del; } if (flags & ADDPART_FLAG_READONLY) bdev_set_flag(bdev, BD_READ_ONLY); /* everything is up and running, commence */ err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); if (err) goto out_del; bdev_add(bdev, devt); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); return bdev; out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); return ERR_PTR(err); out_put_disk: put_disk(disk); return ERR_PTR(err); } static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { struct block_device *part; bool overlap = false; unsigned long idx; rcu_read_lock(); xa_for_each_start(&disk->part_tbl, idx, part, 1) { if (bdev_partno(part) != skip_partno && start < part->bd_start_sect + bdev_nr_sectors(part) && start + length > part->bd_start_sect) { overlap = true; break; } } rcu_read_unlock(); return overlap; } int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { struct block_device *part; int ret; mutex_lock(&disk->open_mutex); if (!disk_live(disk)) { ret = -ENXIO; goto out; } if (disk->flags & GENHD_FL_NO_PART) { ret = -EINVAL; goto out; } if (partition_overlaps(disk, start, length, -1)) { ret = -EBUSY; goto out; } part = add_partition(disk, partno, start, length, ADDPART_FLAG_NONE, NULL); ret = PTR_ERR_OR_ZERO(part); out: mutex_unlock(&disk->open_mutex); return ret; } int bdev_del_partition(struct gendisk *disk, int partno) { struct block_device 
*part = NULL; int ret = -ENXIO; mutex_lock(&disk->open_mutex); part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EBUSY; if (atomic_read(&part->bd_openers)) goto out_unlock; /* * We verified that @part->bd_openers is zero above and so * @part->bd_holder{_ops} can't be set. And since we hold * @disk->open_mutex the device can't be claimed by anyone. * * So no need to call @part->bd_holder_ops->mark_dead() here. * Just delete the partition and invalidate it. */ bdev_unhash(part); invalidate_bdev(part); drop_partition(part); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); return ret; } int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { struct block_device *part = NULL; int ret = -ENXIO; mutex_lock(&disk->open_mutex); part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EINVAL; if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; if (partition_overlaps(disk, start, length, partno)) goto out_unlock; bdev_set_nr_sectors(part, length); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); return ret; } static bool disk_unlock_native_capacity(struct gendisk *disk) { if (!disk->fops->unlock_native_capacity || test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) { printk(KERN_CONT "truncated\n"); return false; } printk(KERN_CONT "enabling native capacity\n"); disk->fops->unlock_native_capacity(disk); return true; } static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; struct block_device *part; if (!size) return true; if (from >= get_capacity(disk)) { printk(KERN_WARNING "%s: p%d start %llu is beyond EOD, ", disk->disk_name, p, (unsigned long long) from); if (disk_unlock_native_capacity(disk)) return false; return true; } if (from + size > get_capacity(disk)) { printk(KERN_WARNING "%s: p%d size %llu extends beyond EOD, ", disk->disk_name, p, (unsigned long long) size); if (disk_unlock_native_capacity(disk)) return false; /* * We can not ignore partitions of broken tables created by for * example camera firmware, but we limit them to the end of the * disk to avoid creating invalid block devices. */ size = get_capacity(disk) - from; } part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part)) { if (PTR_ERR(part) != -ENXIO) { printk(KERN_ERR " %s: p%d could not be added: %pe\n", disk->disk_name, p, part); } return true; } if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) md_autodetect_dev(part->bd_dev); return true; } static int blk_add_partitions(struct gendisk *disk) { struct parsed_partitions *state; int ret = -EAGAIN, p; if (!disk_has_partscan(disk)) return 0; state = check_partition(disk); if (!state) return 0; if (IS_ERR(state)) { /* * I/O error reading the partition table. If we tried to read * beyond EOD, retry after unlocking the native capacity. */ if (PTR_ERR(state) == -ENOSPC) { printk(KERN_WARNING "%s: partition table beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) return -EAGAIN; } return -EIO; } /* * Partitions are not supported on host managed zoned block devices. 
 */
	if (bdev_is_zoned(disk->part0)) {
		pr_warn("%s: ignoring partition table on host managed zoned block device\n",
			disk->disk_name);
		ret = 0;
		goto out_free_state;
	}

	/*
	 * If we read beyond EOD, try unlocking native capacity even if the
	 * partition table was successfully read as we could be missing some
	 * partitions.
	 */
	if (state->access_beyond_eod) {
		printk(KERN_WARNING
		       "%s: partition table partially beyond EOD, ",
		       disk->disk_name);
		if (disk_unlock_native_capacity(disk))
			goto out_free_state;
	}

	/* tell userspace that the media / partition table may have changed */
	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);

	for (p = 1; p < state->limit; p++)
		if (!blk_add_partition(disk, state, p))
			goto out_free_state;

	ret = 0;
out_free_state:
	free_partitions(state);
	return ret;
}

int bdev_disk_changed(struct gendisk *disk, bool invalidate)
{
	struct block_device *part;
	unsigned long idx;
	int ret = 0;

	lockdep_assert_held(&disk->open_mutex);

	if (!disk_live(disk))
		return -ENXIO;

rescan:
	if (disk->open_partitions)
		return -EBUSY;
	sync_blockdev(disk->part0);
	invalidate_bdev(disk->part0);

	xa_for_each_start(&disk->part_tbl, idx, part, 1) {
		/*
		 * Remove the block device from the inode hash, so that
		 * it cannot be looked up any more even when openers
		 * still hold references.
		 */
		bdev_unhash(part);

		/*
		 * If @disk->open_partitions isn't elevated but there's
		 * still an active holder of that block device things
		 * are broken.
		 */
		WARN_ON_ONCE(atomic_read(&part->bd_openers));
		invalidate_bdev(part);
		drop_partition(part);
	}
	clear_bit(GD_NEED_PART_SCAN, &disk->state);

	/*
	 * Historically we only set the capacity to zero for devices that
	 * support partitions (independent of actually having partitions
	 * created). Doing that is rather inconsistent, but changing it
	 * broke legacy udisks polling for legacy ide-cdrom devices. Use
	 * the crude check below to get the sane behavior for most devices
	 * while not breaking userspace for this particular setup.
	 */
	if (invalidate) {
		if (!(disk->flags & GENHD_FL_NO_PART) ||
		    !(disk->flags & GENHD_FL_REMOVABLE))
			set_capacity(disk, 0);
	}

	if (get_capacity(disk)) {
		ret = blk_add_partitions(disk);
		if (ret == -EAGAIN)
			goto rescan;
	} else if (invalidate) {
		/*
		 * Tell userspace that the media / partition table may have
		 * changed.
		 */
		kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
	}

	return ret;
}
/*
 * Only exported for loop and dasd for historic reasons. Don't use in new
 * code!
 */
EXPORT_SYMBOL_GPL(bdev_disk_changed);

void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
{
	struct address_space *mapping = state->disk->part0->bd_mapping;
	struct folio *folio;

	if (n >= get_capacity(state->disk)) {
		state->access_beyond_eod = true;
		goto out;
	}

	folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL);
	if (IS_ERR(folio))
		goto out;

	p->v = folio;
	return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE);
out:
	p->v = NULL;
	return NULL;
}
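For reference, bdev_add_partition(), bdev_del_partition() and bdev_resize_partition() above are reached from userspace through the BLKPG ioctl. A minimal userspace sketch; the device path and partition geometry are invented for the example, and BLKPG takes the start and length in bytes:

/*
 * Userspace sketch: create partition 1 via the BLKPG ioctl, which ends up
 * in bdev_add_partition() above. Run only against a scratch device.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkpg.h>

int main(void)
{
	struct blkpg_partition part = {
		.start  = 1024 * 1024,		/* byte offset on the disk */
		.length = 64 * 1024 * 1024,	/* length in bytes */
		.pno    = 1,			/* partition number */
	};
	struct blkpg_ioctl_arg arg = {
		.op      = BLKPG_ADD_PARTITION,
		.datalen = sizeof(part),
		.data    = &part,
	};
	int fd = open("/dev/loop0", O_RDWR);	/* example device only */

	if (fd < 0 || ioctl(fd, BLKPG, &arg) < 0) {
		perror("BLKPG_ADD_PARTITION");
		return 1;
	}
	close(fd);
	return 0;
}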
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) International Business Machines Corp., 2006
 *
 * Authors: Artem Bityutskiy (Битюцкий Артём), Thomas Gleixner
 */

/*
 * UBI wear-leveling sub-system.
 *
 * This sub-system is responsible for wear-leveling. It works in terms of
 * physical eraseblocks and erase counters and knows nothing about logical
 * eraseblocks, volumes, etc. From this sub-system's perspective all physical
 * eraseblocks are of two types - used and free. Used physical eraseblocks are
 * those that were "get" by the 'ubi_wl_get_peb()' function, and free physical
 * eraseblocks are those that were put by the 'ubi_wl_put_peb()' function.
 *
 * Physical eraseblocks returned by 'ubi_wl_get_peb()' have only erase counter
 * header. The rest of the physical eraseblock contains only %0xFF bytes.
 *
 * When physical eraseblocks are returned to the WL sub-system by means of the
 * 'ubi_wl_put_peb()' function, they are scheduled for erasure. The erasure is
 * done asynchronously in context of the per-UBI device background thread,
 * which is also managed by the WL sub-system.
 *
 * The wear-leveling is ensured by means of moving the contents of used
 * physical eraseblocks with low erase counter to free physical eraseblocks
 * with high erase counter.
 *
 * If the WL sub-system fails to erase a physical eraseblock, it marks it as
 * bad.
 *
 * This sub-system is also responsible for scrubbing. If a bit-flip is detected
 * in a physical eraseblock, it has to be moved. Technically this is the same
 * as moving it for wear-leveling reasons.
 *
 * As it was said, for the UBI sub-system all physical eraseblocks are either
 * "free" or "used". Free eraseblocks are kept in the @wl->free RB-tree, while
 * used eraseblocks are kept in @wl->used, @wl->erroneous, or @wl->scrub
 * RB-trees, as well as (temporarily) in the @wl->pq queue.
 *
 * When the WL sub-system returns a physical eraseblock, the physical
 * eraseblock is protected from being moved for some "time". For this reason,
 * the physical eraseblock is not directly moved from the @wl->free tree to the
 * @wl->used tree. There is a protection queue in between where this
 * physical eraseblock is temporarily stored (@wl->pq).
 *
 * All this protection stuff is needed because:
 * o we don't want to move physical eraseblocks just after we have given them
 *   to the user; instead, we first want to let users fill them up with data;
 *
 * o there is a chance that the user will put the physical eraseblock very
 *   soon, so it makes sense not to move it for some time, but wait.
 *
 * Physical eraseblocks stay protected only for a limited time. But the "time"
 * is measured in erase cycles in this case. This is implemented with the help
 * of the protection queue. Eraseblocks are put to the tail of this queue when
 * they are returned by the 'ubi_wl_get_peb()', and eraseblocks are removed
 * from the head of the queue on each erase operation (for any eraseblock). So
 * the length of the queue defines how many (global) erase cycles PEBs are
 * protected.
 *
 * To put it differently, each physical eraseblock has 2 main states: free and
 * used. The former state corresponds to the @wl->free tree. The latter state
 * is split into several sub-states:
 * o the WL movement is allowed (@wl->used tree);
 * o the WL movement is disallowed (@wl->erroneous) because the PEB is
 *   erroneous - e.g., there was a read error;
 * o the WL movement is temporarily prohibited (@wl->pq queue);
 * o scrubbing is needed (@wl->scrub tree).
 *
 * Depending on the sub-state, wear-leveling entries of the used physical
 * eraseblocks may be kept in one of those structures.
 *
 * Note, in this implementation, we keep a small in-RAM object for each physical
 * eraseblock. This is surely not a scalable solution. But it appears to be good
 * enough for moderately large flashes and it is simple. In future, one may
 * re-work this sub-system and make it more scalable.
 *
 * At the moment this sub-system does not utilize the sequence number, which
 * was introduced relatively recently. But it would be wise to do this because
 * the sequence number of a logical eraseblock characterizes how old it is. For
 * example, when we move a PEB with low erase counter, and we need to pick the
 * target PEB, we pick a PEB with the highest EC if our PEB is "old" and we
 * pick target PEB with an average EC if our PEB is not very "old". This leaves
 * room for future re-work of the WL sub-system.
 */
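To make the protection-queue mechanics described above concrete, the following standalone program models it. This is a deliberately simplified sketch under invented assumptions (a queue length of 10 and one protected PEB per slot), whereas the real @wl->pq keeps a whole list of entries in each slot:

#include <stdio.h>

#define PQ_LEN 10			/* stands in for UBI_PROT_QUEUE_LEN */

static int pq[PQ_LEN];			/* pq[i]: protected PEB in slot i, or -1 */
static int pq_head;			/* slot served on the next erase */

/* Model of prot_queue_add(): newly returned PEBs go to the queue tail. */
static void protect_peb(int pnum)
{
	int pq_tail = pq_head - 1;

	if (pq_tail < 0)
		pq_tail = PQ_LEN - 1;
	pq[pq_tail] = pnum;
}

/* Model of serve_prot_queue(): the head advances once per erase operation. */
static void serve_one_erase(void)
{
	if (pq[pq_head] >= 0) {
		printf("PEB %d: protection over, move to used tree\n",
		       pq[pq_head]);
		pq[pq_head] = -1;
	}
	pq_head = (pq_head + 1) % PQ_LEN;
}

int main(void)
{
	for (int i = 0; i < PQ_LEN; i++)
		pq[i] = -1;

	protect_peb(42);		/* as if returned by ubi_wl_get_peb() */
	for (int i = 1; i <= PQ_LEN; i++)
		serve_one_erase();	/* PEB 42 expires on the 10th erase */
	return 0;
}

The point of the model: an entry enqueued at the tail is only reached after PQ_LEN head advances, i.e. PEBs stay protected for a number of global erase cycles equal to the queue length.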
#include <linux/slab.h>
#include <linux/crc32.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include "ubi.h"
#include "wl.h"

/* Number of physical eraseblocks reserved for wear-leveling purposes */
#define WL_RESERVED_PEBS 1

/*
 * Maximum difference between two erase counters. If this threshold is
 * exceeded, the WL sub-system starts moving data from used physical
 * eraseblocks with low erase counter to free physical eraseblocks with high
 * erase counter.
 */
#define UBI_WL_THRESHOLD CONFIG_MTD_UBI_WL_THRESHOLD

/*
 * When a physical eraseblock is moved, the WL sub-system has to pick the target
 * physical eraseblock to move to. The simplest way would be just to pick the
 * one with the highest erase counter. But in certain workloads this could lead
 * to unlimited wear of one or a few physical eraseblocks. Indeed, imagine a
 * situation when the picked physical eraseblock is constantly erased after the
 * data is written to it. So, we have a constant which limits the highest erase
 * counter of the free physical eraseblock to pick. Namely, the WL sub-system
 * does not pick eraseblocks with erase counter greater than the lowest erase
 * counter plus %WL_FREE_MAX_DIFF.
 */
#define WL_FREE_MAX_DIFF (2*UBI_WL_THRESHOLD)

/*
 * Maximum number of consecutive background thread failures which is enough to
 * switch to read-only mode.
 */
#define WL_MAX_FAILURES 32

static int self_check_ec(struct ubi_device *ubi, int pnum, int ec);
static int self_check_in_wl_tree(const struct ubi_device *ubi,
				 struct ubi_wl_entry *e, struct rb_root *root);
static int self_check_in_pq(const struct ubi_device *ubi,
			    struct ubi_wl_entry *e);

/**
 * wl_tree_add - add a wear-leveling entry to a WL RB-tree.
 * @e: the wear-leveling entry to add
 * @root: the root of the tree
 *
 * Note, we use (erase counter, physical eraseblock number) pairs as keys in
 * the @ubi->used and @ubi->free RB-trees.
 */
static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root)
{
	struct rb_node **p, *parent = NULL;

	p = &root->rb_node;
	while (*p) {
		struct ubi_wl_entry *e1;

		parent = *p;
		e1 = rb_entry(parent, struct ubi_wl_entry, u.rb);

		if (e->ec < e1->ec)
			p = &(*p)->rb_left;
		else if (e->ec > e1->ec)
			p = &(*p)->rb_right;
		else {
			ubi_assert(e->pnum != e1->pnum);
			if (e->pnum < e1->pnum)
				p = &(*p)->rb_left;
			else
				p = &(*p)->rb_right;
		}
	}

	rb_link_node(&e->u.rb, parent, p);
	rb_insert_color(&e->u.rb, root);
}

/**
 * wl_entry_destroy - destroy a wear-leveling entry.
 * @ubi: UBI device description object
 * @e: the wear-leveling entry to destroy
 *
 * This function destroys a wear leveling entry and removes
 * the reference from the lookup table.
 */
static void wl_entry_destroy(struct ubi_device *ubi, struct ubi_wl_entry *e)
{
	ubi->lookuptbl[e->pnum] = NULL;
	kmem_cache_free(ubi_wl_entry_slab, e);
}

/**
 * do_work - do one pending work.
 * @ubi: UBI device description object
 * @executed: whether a work was executed
 *
 * This function returns zero in case of success and a negative error code in
 * case of failure. If @executed is not NULL, it is set to %1 if a work was
 * executed, otherwise it is set to %0.
 */
static int do_work(struct ubi_device *ubi, int *executed)
{
	int err;
	struct ubi_work *wrk;

	cond_resched();

	/*
	 * @ubi->work_sem is used to synchronize with the workers. Workers take
	 * it in read mode, so many of them may be doing works at a time. But
	 * the queue flush code has to be sure the whole queue of works is
	 * done, and it takes the semaphore in write mode.
	 */
	down_read(&ubi->work_sem);
	spin_lock(&ubi->wl_lock);
	if (list_empty(&ubi->works)) {
		spin_unlock(&ubi->wl_lock);
		up_read(&ubi->work_sem);
		if (executed)
			*executed = 0;
		return 0;
	}

	if (executed)
		*executed = 1;
	wrk = list_entry(ubi->works.next, struct ubi_work, list);
	list_del(&wrk->list);
	ubi->works_count -= 1;
	ubi_assert(ubi->works_count >= 0);
	spin_unlock(&ubi->wl_lock);

	/*
	 * Call the worker function. Do not touch the work structure
	 * after this call as it will have been freed or reused by that
	 * time by the worker function.
	 */
	err = wrk->func(ubi, wrk, 0);
	if (err)
		ubi_err(ubi, "work failed with error code %d", err);
	up_read(&ubi->work_sem);

	return err;
}

/**
 * in_wl_tree - check if wear-leveling entry is present in a WL RB-tree.
 * @e: the wear-leveling entry to check
 * @root: the root of the tree
 *
 * This function returns non-zero if @e is in the @root RB-tree and zero if it
 * is not.
 */
static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root)
{
	struct rb_node *p;

	p = root->rb_node;
	while (p) {
		struct ubi_wl_entry *e1;

		e1 = rb_entry(p, struct ubi_wl_entry, u.rb);

		if (e->pnum == e1->pnum) {
			ubi_assert(e == e1);
			return 1;
		}

		if (e->ec < e1->ec)
			p = p->rb_left;
		else if (e->ec > e1->ec)
			p = p->rb_right;
		else {
			ubi_assert(e->pnum != e1->pnum);
			if (e->pnum < e1->pnum)
				p = p->rb_left;
			else
				p = p->rb_right;
		}
	}

	return 0;
}

/**
 * in_pq - check if a wear-leveling entry is present in the protection queue.
 * @ubi: UBI device description object
 * @e: the wear-leveling entry to check
 *
 * This function returns non-zero if @e is in the protection queue and zero
 * if it is not.
 */
static inline int in_pq(const struct ubi_device *ubi, struct ubi_wl_entry *e)
{
	struct ubi_wl_entry *p;
	int i;

	for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i)
		list_for_each_entry(p, &ubi->pq[i], u.list)
			if (p == e)
				return 1;

	return 0;
}

/**
 * prot_queue_add - add physical eraseblock to the protection queue.
 * @ubi: UBI device description object
 * @e: the physical eraseblock to add
 *
 * This function adds @e to the tail of the protection queue @ubi->pq, where
 * @e will stay for %UBI_PROT_QUEUE_LEN erase operations and will be
 * temporarily protected from the wear-leveling worker. Note, @wl->lock has to
 * be locked.
 */
static void prot_queue_add(struct ubi_device *ubi, struct ubi_wl_entry *e)
{
	int pq_tail = ubi->pq_head - 1;

	if (pq_tail < 0)
		pq_tail = UBI_PROT_QUEUE_LEN - 1;
	ubi_assert(pq_tail >= 0 && pq_tail < UBI_PROT_QUEUE_LEN);
	list_add_tail(&e->u.list, &ubi->pq[pq_tail]);
	dbg_wl("added PEB %d EC %d to the protection queue", e->pnum, e->ec);
}

/**
 * find_wl_entry - find wear-leveling entry closest to certain erase counter.
 * @ubi: UBI device description object
 * @root: the RB-tree where to look for
 * @diff: maximum possible difference from the smallest erase counter
 * @pick_max: pick the PEB even if its erase counter is beyond 'min_ec + @diff'
 *
 * This function looks for a wear leveling entry with erase counter closest to
 * min + @diff, where min is the smallest erase counter.
 */
static struct ubi_wl_entry *find_wl_entry(struct ubi_device *ubi,
					  struct rb_root *root, int diff,
					  int pick_max)
{
	struct rb_node *p;
	struct ubi_wl_entry *e;
	int max;

	e = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb);
	max = e->ec + diff;

	p = root->rb_node;
	while (p) {
		struct ubi_wl_entry *e1;

		e1 = rb_entry(p, struct ubi_wl_entry, u.rb);
		if (e1->ec >= max) {
			if (pick_max)
				e = e1;
			p = p->rb_left;
		} else {
			p = p->rb_right;
			e = e1;
		}
	}

	return e;
}
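To illustrate the selection rule implemented by find_wl_entry(), the following standalone sketch mirrors the same walk over a sorted array standing in for the RB-tree: without @pick_max it returns the entry with the largest erase counter still below min_ec + diff; with @pick_max it may return the first entry at or beyond that limit. The numbers are invented for the example:

#include <stdio.h>

/*
 * Standalone model of the find_wl_entry() walk above. A sorted array of
 * erase counters stands in for the @free RB-tree.
 */
static int pick_ec(const int *ec, int n, int diff, int pick_max)
{
	int max = ec[0] + diff;		/* ec[0] is the smallest erase counter */
	int best = ec[0];

	for (int i = 0; i < n; i++) {
		if (ec[i] >= max) {
			if (pick_max)
				best = ec[i];	/* first entry at/over the limit */
			break;
		}
		best = ec[i];		/* largest EC still below min_ec + diff */
	}
	return best;
}

int main(void)
{
	int ec[] = { 100, 101, 105, 120, 400 };

	printf("%d\n", pick_ec(ec, 5, 8, 0));	/* 105: closest below 100 + 8 */
	printf("%d\n", pick_ec(ec, 5, 8, 1));	/* 120: may exceed the limit */
	return 0;
}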
* @ubi: UBI device description object * @root: the RB-tree where to look for * * This function looks for a wear leveling entry with medium erase counter, * but not greater or equivalent than the lowest erase counter plus * %WL_FREE_MAX_DIFF/2. */ static struct ubi_wl_entry *find_mean_wl_entry(struct ubi_device *ubi, struct rb_root *root) { struct ubi_wl_entry *e, *first, *last; first = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb); last = rb_entry(rb_last(root), struct ubi_wl_entry, u.rb); if (last->ec - first->ec < WL_FREE_MAX_DIFF) { e = rb_entry(root->rb_node, struct ubi_wl_entry, u.rb); /* * If no fastmap has been written and fm_anchor is not * reserved and this WL entry can be used as anchor PEB * hold it back and return the second best WL entry such * that fastmap can use the anchor PEB later. */ e = may_reserve_for_fm(ubi, e, root); } else e = find_wl_entry(ubi, root, WL_FREE_MAX_DIFF/2, 0); return e; } /** * wl_get_wle - get a mean wl entry to be used by ubi_wl_get_peb() or * refill_wl_user_pool(). * @ubi: UBI device description object * * This function returns a wear leveling entry in case of success and * NULL in case of failure. */ static struct ubi_wl_entry *wl_get_wle(struct ubi_device *ubi) { struct ubi_wl_entry *e; e = find_mean_wl_entry(ubi, &ubi->free); if (!e) { ubi_err(ubi, "no free eraseblocks"); return NULL; } self_check_in_wl_tree(ubi, e, &ubi->free); /* * Move the physical eraseblock to the protection queue where it will * be protected from being moved for some time. */ rb_erase(&e->u.rb, &ubi->free); ubi->free_count--; dbg_wl("PEB %d EC %d", e->pnum, e->ec); return e; } /** * prot_queue_del - remove a physical eraseblock from the protection queue. * @ubi: UBI device description object * @pnum: the physical eraseblock to remove * * This function deletes PEB @pnum from the protection queue and returns zero * in case of success and %-ENODEV if the PEB was not found. */ static int prot_queue_del(struct ubi_device *ubi, int pnum) { struct ubi_wl_entry *e; e = ubi->lookuptbl[pnum]; if (!e) return -ENODEV; if (self_check_in_pq(ubi, e)) return -ENODEV; list_del(&e->u.list); dbg_wl("deleted PEB %d from the protection queue", e->pnum); return 0; } /** * ubi_sync_erase - synchronously erase a physical eraseblock. * @ubi: UBI device description object * @e: the physical eraseblock to erase * @torture: if the physical eraseblock has to be tortured * * This function returns zero in case of success and a negative error code in * case of failure. */ int ubi_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int torture) { int err; struct ubi_ec_hdr *ec_hdr; unsigned long long ec = e->ec; dbg_wl("erase PEB %d, old EC %llu", e->pnum, ec); err = self_check_ec(ubi, e->pnum, e->ec); if (err) return -EINVAL; ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS); if (!ec_hdr) return -ENOMEM; err = ubi_io_sync_erase(ubi, e->pnum, torture); if (err < 0) goto out_free; ec += err; if (ec > UBI_MAX_ERASECOUNTER) { /* * Erase counter overflow. Upgrade UBI and use 64-bit * erase counters internally. */ ubi_err(ubi, "erase counter overflow at PEB %d, EC %llu", e->pnum, ec); err = -EINVAL; goto out_free; } dbg_wl("erased PEB %d, new EC %llu", e->pnum, ec); ec_hdr->ec = cpu_to_be64(ec); err = ubi_io_write_ec_hdr(ubi, e->pnum, ec_hdr); if (err) goto out_free; e->ec = ec; spin_lock(&ubi->wl_lock); if (e->ec > ubi->max_ec) ubi->max_ec = e->ec; spin_unlock(&ubi->wl_lock); out_free: kfree(ec_hdr); return err; } /** * serve_prot_queue - check if it is time to stop protecting PEBs. 
* @ubi: UBI device description object * * This function is called after each erase operation and removes PEBs from the * tail of the protection queue. These PEBs have been protected for long enough * and should be moved to the used tree. */ static void serve_prot_queue(struct ubi_device *ubi) { struct ubi_wl_entry *e, *tmp; int count; /* * There may be several protected physical eraseblock to remove, * process them all. */ repeat: count = 0; spin_lock(&ubi->wl_lock); list_for_each_entry_safe(e, tmp, &ubi->pq[ubi->pq_head], u.list) { dbg_wl("PEB %d EC %d protection over, move to used tree", e->pnum, e->ec); list_del(&e->u.list); wl_tree_add(e, &ubi->used); if (count++ > 32) { /* * Let's be nice and avoid holding the spinlock for * too long. */ spin_unlock(&ubi->wl_lock); cond_resched(); goto repeat; } } ubi->pq_head += 1; if (ubi->pq_head == UBI_PROT_QUEUE_LEN) ubi->pq_head = 0; ubi_assert(ubi->pq_head >= 0 && ubi->pq_head < UBI_PROT_QUEUE_LEN); spin_unlock(&ubi->wl_lock); } /** * __schedule_ubi_work - schedule a work. * @ubi: UBI device description object * @wrk: the work to schedule * * This function adds a work defined by @wrk to the tail of the pending works * list. Can only be used if ubi->work_sem is already held in read mode! */ static void __schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk) { spin_lock(&ubi->wl_lock); list_add_tail(&wrk->list, &ubi->works); ubi_assert(ubi->works_count >= 0); ubi->works_count += 1; if (ubi->thread_enabled && !ubi_dbg_is_bgt_disabled(ubi)) wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); } /** * schedule_ubi_work - schedule a work. * @ubi: UBI device description object * @wrk: the work to schedule * * This function adds a work defined by @wrk to the tail of the pending works * list. */ static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk) { down_read(&ubi->work_sem); __schedule_ubi_work(ubi, wrk); up_read(&ubi->work_sem); } static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, int shutdown); /** * schedule_erase - schedule an erase work. * @ubi: UBI device description object * @e: the WL entry of the physical eraseblock to erase * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @torture: if the physical eraseblock has to be tortured * @nested: denotes whether the work_sem is already held * * This function returns zero in case of success and a %-ENOMEM in case of * failure. */ static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int vol_id, int lnum, int torture, bool nested) { struct ubi_work *wl_wrk; ubi_assert(e); dbg_wl("schedule erasure of PEB %d, EC %d, torture %d", e->pnum, e->ec, torture); wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS); if (!wl_wrk) return -ENOMEM; wl_wrk->func = &erase_worker; wl_wrk->e = e; wl_wrk->vol_id = vol_id; wl_wrk->lnum = lnum; wl_wrk->torture = torture; if (nested) __schedule_ubi_work(ubi, wl_wrk); else schedule_ubi_work(ubi, wl_wrk); return 0; } static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk); /** * do_sync_erase - run the erase worker synchronously. 
* @ubi: UBI device description object * @e: the WL entry of the physical eraseblock to erase * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @torture: if the physical eraseblock has to be tortured * */ static int do_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int vol_id, int lnum, int torture) { struct ubi_work wl_wrk; dbg_wl("sync erase of PEB %i", e->pnum); wl_wrk.e = e; wl_wrk.vol_id = vol_id; wl_wrk.lnum = lnum; wl_wrk.torture = torture; return __erase_worker(ubi, &wl_wrk); } static int ensure_wear_leveling(struct ubi_device *ubi, int nested); /** * wear_leveling_worker - wear-leveling worker function. * @ubi: UBI device description object * @wrk: the work object * @shutdown: non-zero if the worker has to free memory and exit * because the WL-subsystem is shutting down * * This function copies a more worn out physical eraseblock to a less worn out * one. Returns zero in case of success and a negative error code in case of * failure. */ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, int shutdown) { int err, scrubbing = 0, torture = 0, protect = 0, erroneous = 0; int erase = 0, keep = 0, vol_id = -1, lnum = -1; struct ubi_wl_entry *e1, *e2; struct ubi_vid_io_buf *vidb; struct ubi_vid_hdr *vid_hdr; int dst_leb_clean = 0; kfree(wrk); if (shutdown) return 0; vidb = ubi_alloc_vid_buf(ubi, GFP_NOFS); if (!vidb) return -ENOMEM; vid_hdr = ubi_get_vid_hdr(vidb); down_read(&ubi->fm_eba_sem); mutex_lock(&ubi->move_mutex); spin_lock(&ubi->wl_lock); ubi_assert(!ubi->move_from && !ubi->move_to); ubi_assert(!ubi->move_to_put); #ifdef CONFIG_MTD_UBI_FASTMAP if (!next_peb_for_wl(ubi, true) || #else if (!ubi->free.rb_node || #endif (!ubi->used.rb_node && !ubi->scrub.rb_node)) { /* * No free physical eraseblocks? Well, they must be waiting in * the queue to be erased. Cancel movement - it will be * triggered again when a free physical eraseblock appears. * * No used physical eraseblocks? They must be temporarily * protected from being moved. They will be moved to the * @ubi->used tree later and the wear-leveling will be * triggered again. */ dbg_wl("cancel WL, a list is empty: free %d, used %d", !ubi->free.rb_node, !ubi->used.rb_node); goto out_cancel; } #ifdef CONFIG_MTD_UBI_FASTMAP e1 = find_anchor_wl_entry(&ubi->used); if (e1 && ubi->fm_anchor && (ubi->fm_anchor->ec - e1->ec >= UBI_WL_THRESHOLD)) { ubi->fm_do_produce_anchor = 1; /* * fm_anchor is no longer considered a good anchor. * NULL assignment also prevents multiple wear level checks * of this PEB. */ wl_tree_add(ubi->fm_anchor, &ubi->free); ubi->fm_anchor = NULL; ubi->free_count++; } if (ubi->fm_do_produce_anchor) { if (!e1) goto out_cancel; e2 = get_peb_for_wl(ubi); if (!e2) goto out_cancel; self_check_in_wl_tree(ubi, e1, &ubi->used); rb_erase(&e1->u.rb, &ubi->used); dbg_wl("anchor-move PEB %d to PEB %d", e1->pnum, e2->pnum); ubi->fm_do_produce_anchor = 0; } else if (!ubi->scrub.rb_node) { #else if (!ubi->scrub.rb_node) { #endif /* * Now pick the least worn-out used physical eraseblock and a * highly worn-out free physical eraseblock. If the erase * counters differ much enough, start wear-leveling. 
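 * For illustration, assume UBI_WL_THRESHOLD is 128: a least worn-out
 * used PEB at EC 1000 stays where it is while the picked free PEB has
 * EC 1100, but is copied out as soon as the picked free PEB has an EC
 * of 1128 or more.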
*/ e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb); e2 = get_peb_for_wl(ubi); if (!e2) goto out_cancel; if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) { dbg_wl("no WL needed: min used EC %d, max free EC %d", e1->ec, e2->ec); /* Give the unused PEB back */ wl_tree_add(e2, &ubi->free); ubi->free_count++; goto out_cancel; } self_check_in_wl_tree(ubi, e1, &ubi->used); rb_erase(&e1->u.rb, &ubi->used); dbg_wl("move PEB %d EC %d to PEB %d EC %d", e1->pnum, e1->ec, e2->pnum, e2->ec); } else { /* Perform scrubbing */ scrubbing = 1; e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb); e2 = get_peb_for_wl(ubi); if (!e2) goto out_cancel; self_check_in_wl_tree(ubi, e1, &ubi->scrub); rb_erase(&e1->u.rb, &ubi->scrub); dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum); } ubi->move_from = e1; ubi->move_to = e2; spin_unlock(&ubi->wl_lock); /* * Now we are going to copy physical eraseblock @e1->pnum to @e2->pnum. * We so far do not know which logical eraseblock our physical * eraseblock (@e1) belongs to. We have to read the volume identifier * header first. * * Note, we are protected from this PEB being unmapped and erased. The * 'ubi_wl_put_peb()' would wait for moving to be finished if the PEB * which is being moved was unmapped. */ err = ubi_io_read_vid_hdr(ubi, e1->pnum, vidb, 0); if (err && err != UBI_IO_BITFLIPS) { dst_leb_clean = 1; if (err == UBI_IO_FF) { /* * We are trying to move PEB without a VID header. UBI * always write VID headers shortly after the PEB was * given, so we have a situation when it has not yet * had a chance to write it, because it was preempted. * So add this PEB to the protection queue so far, * because presumably more data will be written there * (including the missing VID header), and then we'll * move it. */ dbg_wl("PEB %d has no VID header", e1->pnum); protect = 1; goto out_not_moved; } else if (err == UBI_IO_FF_BITFLIPS) { /* * The same situation as %UBI_IO_FF, but bit-flips were * detected. It is better to schedule this PEB for * scrubbing. */ dbg_wl("PEB %d has no VID header but has bit-flips", e1->pnum); scrubbing = 1; goto out_not_moved; } else if (ubi->fast_attach && err == UBI_IO_BAD_HDR_EBADMSG) { /* * While a full scan would detect interrupted erasures * at attach time we can face them here when attached from * Fastmap. */ dbg_wl("PEB %d has ECC errors, maybe from an interrupted erasure", e1->pnum); erase = 1; goto out_not_moved; } ubi_err(ubi, "error %d while reading VID header from PEB %d", err, e1->pnum); goto out_error; } vol_id = be32_to_cpu(vid_hdr->vol_id); lnum = be32_to_cpu(vid_hdr->lnum); err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vidb); if (err) { if (err == MOVE_CANCEL_RACE) { /* * The LEB has not been moved because the volume is * being deleted or the PEB has been put meanwhile. We * should prevent this PEB from being selected for * wear-leveling movement again, so put it to the * protection queue. */ protect = 1; dst_leb_clean = 1; goto out_not_moved; } if (err == MOVE_RETRY) { /* * For source PEB: * 1. The scrubbing is set for scrub type PEB, it will * be put back into ubi->scrub list. * 2. Non-scrub type PEB will be put back into ubi->used * list. */ keep = 1; dst_leb_clean = 1; goto out_not_moved; } if (err == MOVE_TARGET_BITFLIPS || err == MOVE_TARGET_WR_ERR || err == MOVE_TARGET_RD_ERR) { /* * Target PEB had bit-flips or write error - torture it. */ torture = 1; keep = 1; goto out_not_moved; } if (err == MOVE_SOURCE_RD_ERR) { /* * An error happened while reading the source PEB. 
Do * not switch to R/O mode in this case, and give the * upper layers a possibility to recover from this, * e.g. by unmapping corresponding LEB. Instead, just * put this PEB to the @ubi->erroneous list to prevent * UBI from trying to move it over and over again. */ if (ubi->erroneous_peb_count > ubi->max_erroneous) { ubi_err(ubi, "too many erroneous eraseblocks (%d)", ubi->erroneous_peb_count); goto out_error; } dst_leb_clean = 1; erroneous = 1; goto out_not_moved; } if (err < 0) goto out_error; ubi_assert(0); } /* The PEB has been successfully moved */ if (scrubbing) ubi_msg(ubi, "scrubbed PEB %d (LEB %d:%d), data moved to PEB %d", e1->pnum, vol_id, lnum, e2->pnum); ubi_free_vid_buf(vidb); spin_lock(&ubi->wl_lock); if (!ubi->move_to_put) { wl_tree_add(e2, &ubi->used); e2 = NULL; } ubi->move_from = ubi->move_to = NULL; ubi->move_to_put = ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); err = do_sync_erase(ubi, e1, vol_id, lnum, 0); if (err) { if (e2) { spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e2); spin_unlock(&ubi->wl_lock); } goto out_ro; } if (e2) { /* * Well, the target PEB was put meanwhile, schedule it for * erasure. */ dbg_wl("PEB %d (LEB %d:%d) was put meanwhile, erase", e2->pnum, vol_id, lnum); err = do_sync_erase(ubi, e2, vol_id, lnum, 0); if (err) goto out_ro; } dbg_wl("done"); mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); return 0; /* * For some reasons the LEB was not moved, might be an error, might be * something else. @e1 was not changed, so return it back. @e2 might * have been changed, schedule it for erasure. */ out_not_moved: if (vol_id != -1) dbg_wl("cancel moving PEB %d (LEB %d:%d) to PEB %d (%d)", e1->pnum, vol_id, lnum, e2->pnum, err); else dbg_wl("cancel moving PEB %d to PEB %d (%d)", e1->pnum, e2->pnum, err); spin_lock(&ubi->wl_lock); if (protect) prot_queue_add(ubi, e1); else if (erroneous) { wl_tree_add(e1, &ubi->erroneous); ubi->erroneous_peb_count += 1; } else if (scrubbing) wl_tree_add(e1, &ubi->scrub); else if (keep) wl_tree_add(e1, &ubi->used); if (dst_leb_clean) { wl_tree_add(e2, &ubi->free); ubi->free_count++; } ubi_assert(!ubi->move_to_put); ubi->move_from = ubi->move_to = NULL; ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); ubi_free_vid_buf(vidb); if (dst_leb_clean) { ensure_wear_leveling(ubi, 1); } else { err = do_sync_erase(ubi, e2, vol_id, lnum, torture); if (err) goto out_ro; } if (erase) { err = do_sync_erase(ubi, e1, vol_id, lnum, 1); if (err) goto out_ro; } mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); return 0; out_error: if (vol_id != -1) ubi_err(ubi, "error %d while moving PEB %d to PEB %d", err, e1->pnum, e2->pnum); else ubi_err(ubi, "error %d while moving PEB %d (LEB %d:%d) to PEB %d", err, e1->pnum, vol_id, lnum, e2->pnum); spin_lock(&ubi->wl_lock); ubi->move_from = ubi->move_to = NULL; ubi->move_to_put = ubi->wl_scheduled = 0; wl_entry_destroy(ubi, e1); wl_entry_destroy(ubi, e2); spin_unlock(&ubi->wl_lock); ubi_free_vid_buf(vidb); out_ro: ubi_ro_mode(ubi); mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); ubi_assert(err != 0); return err < 0 ? err : -EIO; out_cancel: ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); ubi_free_vid_buf(vidb); return 0; } /** * ensure_wear_leveling - schedule wear-leveling if it is needed. * @ubi: UBI device description object * @nested: set to non-zero if this function is called from UBI worker * * This function checks if it is time to start wear-leveling and schedules it * if yes. 
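 * Scrubbing requests are funnelled through the same path, because
 * scrubbing is performed by the same WL worker.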
This function returns zero in case of success and a negative error * code in case of failure. */ static int ensure_wear_leveling(struct ubi_device *ubi, int nested) { int err = 0; struct ubi_work *wrk; spin_lock(&ubi->wl_lock); if (ubi->wl_scheduled) /* Wear-leveling is already in the work queue */ goto out_unlock; /* * If the ubi->scrub tree is not empty, scrubbing is needed, and the * WL worker has to be scheduled anyway. */ if (!ubi->scrub.rb_node) { #ifdef CONFIG_MTD_UBI_FASTMAP if (!need_wear_leveling(ubi)) goto out_unlock; #else struct ubi_wl_entry *e1; struct ubi_wl_entry *e2; if (!ubi->used.rb_node || !ubi->free.rb_node) /* No physical eraseblocks - no deal */ goto out_unlock; /* * We schedule wear-leveling only if the difference between the * lowest erase counter of used physical eraseblocks and a high * erase counter of free physical eraseblocks is greater than * %UBI_WL_THRESHOLD. */ e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb); e2 = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF, 0); if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) goto out_unlock; #endif dbg_wl("schedule wear-leveling"); } else dbg_wl("schedule scrubbing"); ubi->wl_scheduled = 1; spin_unlock(&ubi->wl_lock); wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS); if (!wrk) { err = -ENOMEM; goto out_cancel; } wrk->func = &wear_leveling_worker; if (nested) __schedule_ubi_work(ubi, wrk); else schedule_ubi_work(ubi, wrk); return err; out_cancel: spin_lock(&ubi->wl_lock); ubi->wl_scheduled = 0; out_unlock: spin_unlock(&ubi->wl_lock); return err; } /** * __erase_worker - physical eraseblock erase worker function. * @ubi: UBI device description object * @wl_wrk: the work object * * This function erases a physical eraseblock and perform torture testing if * needed. It also takes care about marking the physical eraseblock bad if * needed. Returns zero in case of success and a negative error code in case of * failure. */ static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk) { struct ubi_wl_entry *e = wl_wrk->e; int pnum = e->pnum; int vol_id = wl_wrk->vol_id; int lnum = wl_wrk->lnum; int err, available_consumed = 0; dbg_wl("erase PEB %d EC %d LEB %d:%d", pnum, e->ec, wl_wrk->vol_id, wl_wrk->lnum); err = ubi_sync_erase(ubi, e, wl_wrk->torture); if (!err) { spin_lock(&ubi->wl_lock); if (!ubi->fm_disabled && !ubi->fm_anchor && e->pnum < UBI_FM_MAX_START) { /* * Abort anchor production, if needed it will be * enabled again in the wear leveling started below. */ ubi->fm_anchor = e; ubi->fm_do_produce_anchor = 0; } else { wl_tree_add(e, &ubi->free); ubi->free_count++; } spin_unlock(&ubi->wl_lock); /* * One more erase operation has happened, take care about * protected physical eraseblocks. */ serve_prot_queue(ubi); /* And take care about wear-leveling */ err = ensure_wear_leveling(ubi, 1); return err; } ubi_err(ubi, "failed to erase PEB %d, error %d", pnum, err); if (err == -EINTR || err == -ENOMEM || err == -EAGAIN || err == -EBUSY) { int err1; /* Re-schedule the LEB for erasure */ err1 = schedule_erase(ubi, e, vol_id, lnum, 0, true); if (err1) { spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e); spin_unlock(&ubi->wl_lock); err = err1; goto out_ro; } return err; } spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e); spin_unlock(&ubi->wl_lock); if (err != -EIO) /* * If this is not %-EIO, we have no idea what to do. Scheduling * this physical eraseblock for erasure again would cause * errors again and again. Well, lets switch to R/O mode. 
*/ goto out_ro; /* It is %-EIO, the PEB went bad */ if (!ubi->bad_allowed) { ubi_err(ubi, "bad physical eraseblock %d detected", pnum); goto out_ro; } spin_lock(&ubi->volumes_lock); if (ubi->beb_rsvd_pebs == 0) { if (ubi->avail_pebs == 0) { spin_unlock(&ubi->volumes_lock); ubi_err(ubi, "no reserved/available physical eraseblocks"); goto out_ro; } ubi->avail_pebs -= 1; available_consumed = 1; } spin_unlock(&ubi->volumes_lock); ubi_msg(ubi, "mark PEB %d as bad", pnum); err = ubi_io_mark_bad(ubi, pnum); if (err) goto out_ro; spin_lock(&ubi->volumes_lock); if (ubi->beb_rsvd_pebs > 0) { if (available_consumed) { /* * The amount of reserved PEBs increased since we last * checked. */ ubi->avail_pebs += 1; available_consumed = 0; } ubi->beb_rsvd_pebs -= 1; } ubi->bad_peb_count += 1; ubi->good_peb_count -= 1; ubi_calculate_reserved(ubi); if (available_consumed) ubi_warn(ubi, "no PEBs in the reserved pool, used an available PEB"); else if (ubi->beb_rsvd_pebs) ubi_msg(ubi, "%d PEBs left in the reserve", ubi->beb_rsvd_pebs); else ubi_warn(ubi, "last PEB from the reserve was used"); spin_unlock(&ubi->volumes_lock); return err; out_ro: if (available_consumed) { spin_lock(&ubi->volumes_lock); ubi->avail_pebs += 1; spin_unlock(&ubi->volumes_lock); } ubi_ro_mode(ubi); return err; } static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, int shutdown) { int ret; if (shutdown) { struct ubi_wl_entry *e = wl_wrk->e; dbg_wl("cancel erasure of PEB %d EC %d", e->pnum, e->ec); kfree(wl_wrk); wl_entry_destroy(ubi, e); return 0; } ret = __erase_worker(ubi, wl_wrk); kfree(wl_wrk); return ret; } /** * ubi_wl_put_peb - return a PEB to the wear-leveling sub-system. * @ubi: UBI device description object * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @pnum: physical eraseblock to return * @torture: if this physical eraseblock has to be tortured * * This function is called to return physical eraseblock @pnum to the pool of * free physical eraseblocks. The @torture flag has to be set if an I/O error * occurred to this @pnum and it has to be tested. This function returns zero * in case of success, and a negative error code in case of failure. */ int ubi_wl_put_peb(struct ubi_device *ubi, int vol_id, int lnum, int pnum, int torture) { int err; struct ubi_wl_entry *e; dbg_wl("PEB %d", pnum); ubi_assert(pnum >= 0); ubi_assert(pnum < ubi->peb_count); down_read(&ubi->fm_protect); retry: spin_lock(&ubi->wl_lock); e = ubi->lookuptbl[pnum]; if (!e) { /* * This wl entry has been removed for some errors by other * process (eg. wear leveling worker), corresponding process * (except __erase_worker, which cannot concurrent with * ubi_wl_put_peb) will set ubi ro_mode at the same time, * just ignore this wl entry. */ spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_protect); return 0; } if (e == ubi->move_from) { /* * User is putting the physical eraseblock which was selected to * be moved. It will be scheduled for erasure in the * wear-leveling worker. */ dbg_wl("PEB %d is being moved, wait", pnum); spin_unlock(&ubi->wl_lock); /* Wait for the WL worker by taking the @ubi->move_mutex */ mutex_lock(&ubi->move_mutex); mutex_unlock(&ubi->move_mutex); goto retry; } else if (e == ubi->move_to) { /* * User is putting the physical eraseblock which was selected * as the target the data is moved to. 
It may happen if the EBA * sub-system already re-mapped the LEB in 'ubi_eba_copy_leb()' * but the WL sub-system has not put the PEB to the "used" tree * yet, but it is about to do this. So we just set a flag which * will tell the WL worker that the PEB is not needed anymore * and should be scheduled for erasure. */ dbg_wl("PEB %d is the target of data moving", pnum); ubi_assert(!ubi->move_to_put); ubi->move_to_put = 1; spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_protect); return 0; } else { if (in_wl_tree(e, &ubi->used)) { self_check_in_wl_tree(ubi, e, &ubi->used); rb_erase(&e->u.rb, &ubi->used); } else if (in_wl_tree(e, &ubi->scrub)) { self_check_in_wl_tree(ubi, e, &ubi->scrub); rb_erase(&e->u.rb, &ubi->scrub); } else if (in_wl_tree(e, &ubi->erroneous)) { self_check_in_wl_tree(ubi, e, &ubi->erroneous); rb_erase(&e->u.rb, &ubi->erroneous); ubi->erroneous_peb_count -= 1; ubi_assert(ubi->erroneous_peb_count >= 0); /* Erroneous PEBs should be tortured */ torture = 1; } else { err = prot_queue_del(ubi, e->pnum); if (err) { ubi_err(ubi, "PEB %d not found", pnum); ubi_ro_mode(ubi); spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_protect); return err; } } } spin_unlock(&ubi->wl_lock); err = schedule_erase(ubi, e, vol_id, lnum, torture, false); if (err) { spin_lock(&ubi->wl_lock); wl_tree_add(e, &ubi->used); spin_unlock(&ubi->wl_lock); } up_read(&ubi->fm_protect); return err; } /** * ubi_wl_scrub_peb - schedule a physical eraseblock for scrubbing. * @ubi: UBI device description object * @pnum: the physical eraseblock to schedule * * If a bit-flip in a physical eraseblock is detected, this physical eraseblock * needs scrubbing. This function schedules a physical eraseblock for * scrubbing which is done in background. This function returns zero in case of * success and a negative error code in case of failure. */ int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum) { struct ubi_wl_entry *e; ubi_msg(ubi, "schedule PEB %d for scrubbing", pnum); retry: spin_lock(&ubi->wl_lock); e = ubi->lookuptbl[pnum]; if (e == ubi->move_from || in_wl_tree(e, &ubi->scrub) || in_wl_tree(e, &ubi->erroneous)) { spin_unlock(&ubi->wl_lock); return 0; } if (e == ubi->move_to) { /* * This physical eraseblock was used to move data to. The data * was moved but the PEB was not yet inserted to the proper * tree. We should just wait a little and let the WL worker * proceed. */ spin_unlock(&ubi->wl_lock); dbg_wl("the PEB %d is not in proper tree, retry", pnum); yield(); goto retry; } if (in_wl_tree(e, &ubi->used)) { self_check_in_wl_tree(ubi, e, &ubi->used); rb_erase(&e->u.rb, &ubi->used); } else { int err; err = prot_queue_del(ubi, e->pnum); if (err) { ubi_err(ubi, "PEB %d not found", pnum); ubi_ro_mode(ubi); spin_unlock(&ubi->wl_lock); return err; } } wl_tree_add(e, &ubi->scrub); spin_unlock(&ubi->wl_lock); /* * Technically scrubbing is the same as wear-leveling, so it is done * by the WL worker. */ return ensure_wear_leveling(ubi, 0); } /** * ubi_wl_flush - flush all pending works. * @ubi: UBI device description object * @vol_id: the volume id to flush for * @lnum: the logical eraseblock number to flush for * * This function executes all pending works for a particular volume id / * logical eraseblock number pair. If either value is set to %UBI_ALL, then it * acts as a wildcard for all of the corresponding volume numbers or logical * eraseblock numbers. It returns zero in case of success and a negative error * code in case of failure. 
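 * For example, ubi_wl_flush(ubi, UBI_ALL, UBI_ALL) synchronously executes
 * every pending work item, no matter which volume or LEB queued it.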
*/ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum) { int err = 0; int found = 1; /* * Erase while the pending works queue is not empty, but not more than * the number of currently pending works. */ dbg_wl("flush pending work for LEB %d:%d (%d pending works)", vol_id, lnum, ubi->works_count); while (found) { struct ubi_work *wrk, *tmp; found = 0; down_read(&ubi->work_sem); spin_lock(&ubi->wl_lock); list_for_each_entry_safe(wrk, tmp, &ubi->works, list) { if ((vol_id == UBI_ALL || wrk->vol_id == vol_id) && (lnum == UBI_ALL || wrk->lnum == lnum)) { list_del(&wrk->list); ubi->works_count -= 1; ubi_assert(ubi->works_count >= 0); spin_unlock(&ubi->wl_lock); err = wrk->func(ubi, wrk, 0); if (err) { up_read(&ubi->work_sem); return err; } spin_lock(&ubi->wl_lock); found = 1; break; } } spin_unlock(&ubi->wl_lock); up_read(&ubi->work_sem); } /* * Make sure all the works which have been done in parallel are * finished. */ down_write(&ubi->work_sem); up_write(&ubi->work_sem); return err; } static bool scrub_possible(struct ubi_device *ubi, struct ubi_wl_entry *e) { if (in_wl_tree(e, &ubi->scrub)) return false; else if (in_wl_tree(e, &ubi->erroneous)) return false; else if (ubi->move_from == e) return false; else if (ubi->move_to == e) return false; return true; } /** * ubi_bitflip_check - Check an eraseblock for bitflips and scrub it if needed. * @ubi: UBI device description object * @pnum: the physical eraseblock to schedule * @force: don't read the block, assume bitflips happened and take action. * * This function reads the given eraseblock and checks if bitflips occured. * In case of bitflips, the eraseblock is scheduled for scrubbing. * If scrubbing is forced with @force, the eraseblock is not read, * but scheduled for scrubbing right away. * * Returns: * %EINVAL, PEB is out of range * %ENOENT, PEB is no longer used by UBI * %EBUSY, PEB cannot be checked now or a check is currently running on it * %EAGAIN, bit flips happened but scrubbing is currently not possible * %EUCLEAN, bit flips happened and PEB is scheduled for scrubbing * %0, no bit flips detected */ int ubi_bitflip_check(struct ubi_device *ubi, int pnum, int force) { int err = 0; struct ubi_wl_entry *e; if (pnum < 0 || pnum >= ubi->peb_count) { err = -EINVAL; goto out; } /* * Pause all parallel work, otherwise it can happen that the * erase worker frees a wl entry under us. */ down_write(&ubi->work_sem); /* * Make sure that the wl entry does not change state while * inspecting it. */ spin_lock(&ubi->wl_lock); e = ubi->lookuptbl[pnum]; if (!e) { spin_unlock(&ubi->wl_lock); err = -ENOENT; goto out_resume; } /* * Does it make sense to check this PEB? */ if (!scrub_possible(ubi, e)) { spin_unlock(&ubi->wl_lock); err = -EBUSY; goto out_resume; } spin_unlock(&ubi->wl_lock); if (!force) { mutex_lock(&ubi->buf_mutex); err = ubi_io_read(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size); mutex_unlock(&ubi->buf_mutex); } if (force || err == UBI_IO_BITFLIPS) { /* * Okay, bit flip happened, let's figure out what we can do. */ spin_lock(&ubi->wl_lock); /* * Recheck. We released wl_lock, UBI might have killed the * wl entry under us. 
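 * (The erase worker, for instance, calls wl_entry_destroy() on a PEB
 * that failed to erase, which clears its @ubi->lookuptbl slot.)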
*/ e = ubi->lookuptbl[pnum]; if (!e) { spin_unlock(&ubi->wl_lock); err = -ENOENT; goto out_resume; } /* * Need to re-check state */ if (!scrub_possible(ubi, e)) { spin_unlock(&ubi->wl_lock); err = -EBUSY; goto out_resume; } if (in_pq(ubi, e)) { prot_queue_del(ubi, e->pnum); wl_tree_add(e, &ubi->scrub); spin_unlock(&ubi->wl_lock); err = ensure_wear_leveling(ubi, 1); } else if (in_wl_tree(e, &ubi->used)) { rb_erase(&e->u.rb, &ubi->used); wl_tree_add(e, &ubi->scrub); spin_unlock(&ubi->wl_lock); err = ensure_wear_leveling(ubi, 1); } else if (in_wl_tree(e, &ubi->free)) { rb_erase(&e->u.rb, &ubi->free); ubi->free_count--; spin_unlock(&ubi->wl_lock); /* * This PEB is empty we can schedule it for * erasure right away. No wear leveling needed. */ err = schedule_erase(ubi, e, UBI_UNKNOWN, UBI_UNKNOWN, force ? 0 : 1, true); } else { spin_unlock(&ubi->wl_lock); err = -EAGAIN; } if (!err && !force) err = -EUCLEAN; } else { err = 0; } out_resume: up_write(&ubi->work_sem); out: return err; } /** * tree_destroy - destroy an RB-tree. * @ubi: UBI device description object * @root: the root of the tree to destroy */ static void tree_destroy(struct ubi_device *ubi, struct rb_root *root) { struct rb_node *rb; struct ubi_wl_entry *e; rb = root->rb_node; while (rb) { if (rb->rb_left) rb = rb->rb_left; else if (rb->rb_right) rb = rb->rb_right; else { e = rb_entry(rb, struct ubi_wl_entry, u.rb); rb = rb_parent(rb); if (rb) { if (rb->rb_left == &e->u.rb) rb->rb_left = NULL; else rb->rb_right = NULL; } wl_entry_destroy(ubi, e); } } } /** * ubi_thread - UBI background thread. * @u: the UBI device description object pointer */ int ubi_thread(void *u) { int failures = 0; struct ubi_device *ubi = u; ubi_msg(ubi, "background thread \"%s\" started, PID %d", ubi->bgt_name, task_pid_nr(current)); set_freezable(); for (;;) { int err; if (kthread_should_stop()) break; if (try_to_freeze()) continue; spin_lock(&ubi->wl_lock); if (list_empty(&ubi->works) || ubi->ro_mode || !ubi->thread_enabled || ubi_dbg_is_bgt_disabled(ubi)) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&ubi->wl_lock); /* * Check kthread_should_stop() after we set the task * state to guarantee that we either see the stop bit * and exit or the task state is reset to runnable such * that it's not scheduled out indefinitely and detects * the stop bit at kthread_should_stop(). */ if (kthread_should_stop()) { set_current_state(TASK_RUNNING); break; } schedule(); continue; } spin_unlock(&ubi->wl_lock); err = do_work(ubi, NULL); if (err) { ubi_err(ubi, "%s: work failed with error code %d", ubi->bgt_name, err); if (failures++ > WL_MAX_FAILURES) { /* * Too many failures, disable the thread and * switch to read-only mode. */ ubi_msg(ubi, "%s: %d consecutive failures", ubi->bgt_name, WL_MAX_FAILURES); ubi_ro_mode(ubi); ubi->thread_enabled = 0; continue; } } else failures = 0; cond_resched(); } dbg_wl("background thread \"%s\" is killed", ubi->bgt_name); ubi->thread_enabled = 0; return 0; } /** * shutdown_work - shutdown all pending works. * @ubi: UBI device description object */ static void shutdown_work(struct ubi_device *ubi) { while (!list_empty(&ubi->works)) { struct ubi_work *wrk; wrk = list_entry(ubi->works.next, struct ubi_work, list); list_del(&wrk->list); wrk->func(ubi, wrk, 1); ubi->works_count -= 1; ubi_assert(ubi->works_count >= 0); } } /** * erase_aeb - erase a PEB given in UBI attach info PEB * @ubi: UBI device description object * @aeb: UBI attach info PEB * @sync: If true, erase synchronously. 
Otherwise schedule for erasure */ static int erase_aeb(struct ubi_device *ubi, struct ubi_ainf_peb *aeb, bool sync) { struct ubi_wl_entry *e; int err; e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) return -ENOMEM; e->pnum = aeb->pnum; e->ec = aeb->ec; ubi->lookuptbl[e->pnum] = e; if (sync) { err = ubi_sync_erase(ubi, e, false); if (err) goto out_free; wl_tree_add(e, &ubi->free); ubi->free_count++; } else { err = schedule_erase(ubi, e, aeb->vol_id, aeb->lnum, 0, false); if (err) goto out_free; } return 0; out_free: wl_entry_destroy(ubi, e); return err; } /** * ubi_wl_init - initialize the WL sub-system using attaching information. * @ubi: UBI device description object * @ai: attaching information * * This function returns zero in case of success, and a negative error code in * case of failure. */ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai) { int err, i, reserved_pebs, found_pebs = 0; struct rb_node *rb1, *rb2; struct ubi_ainf_volume *av; struct ubi_ainf_peb *aeb, *tmp; struct ubi_wl_entry *e; ubi->used = ubi->erroneous = ubi->free = ubi->scrub = RB_ROOT; spin_lock_init(&ubi->wl_lock); mutex_init(&ubi->move_mutex); init_rwsem(&ubi->work_sem); ubi->max_ec = ai->max_ec; INIT_LIST_HEAD(&ubi->works); sprintf(ubi->bgt_name, UBI_BGT_NAME_PATTERN, ubi->ubi_num); err = -ENOMEM; ubi->lookuptbl = kcalloc(ubi->peb_count, sizeof(void *), GFP_KERNEL); if (!ubi->lookuptbl) return err; for (i = 0; i < UBI_PROT_QUEUE_LEN; i++) INIT_LIST_HEAD(&ubi->pq[i]); ubi->pq_head = 0; ubi->free_count = 0; list_for_each_entry_safe(aeb, tmp, &ai->erase, u.list) { cond_resched(); err = erase_aeb(ubi, aeb, false); if (err) goto out_free; found_pebs++; } list_for_each_entry(aeb, &ai->free, u.list) { cond_resched(); e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) { err = -ENOMEM; goto out_free; } e->pnum = aeb->pnum; e->ec = aeb->ec; ubi_assert(e->ec >= 0); wl_tree_add(e, &ubi->free); ubi->free_count++; ubi->lookuptbl[e->pnum] = e; found_pebs++; } ubi_rb_for_each_entry(rb1, av, &ai->volumes, rb) { ubi_rb_for_each_entry(rb2, aeb, &av->root, u.rb) { cond_resched(); e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) { err = -ENOMEM; goto out_free; } e->pnum = aeb->pnum; e->ec = aeb->ec; ubi->lookuptbl[e->pnum] = e; if (!aeb->scrub) { dbg_wl("add PEB %d EC %d to the used tree", e->pnum, e->ec); wl_tree_add(e, &ubi->used); } else { dbg_wl("add PEB %d EC %d to the scrub tree", e->pnum, e->ec); wl_tree_add(e, &ubi->scrub); } found_pebs++; } } list_for_each_entry(aeb, &ai->fastmap, u.list) { cond_resched(); e = ubi_find_fm_block(ubi, aeb->pnum); if (e) { ubi_assert(!ubi->lookuptbl[e->pnum]); ubi->lookuptbl[e->pnum] = e; } else { bool sync = false; /* * Usually old Fastmap PEBs are scheduled for erasure * and we don't have to care about them but if we face * an power cut before scheduling them we need to * take care of them here. */ if (ubi->lookuptbl[aeb->pnum]) continue; /* * The fastmap update code might not find a free PEB for * writing the fastmap anchor to and then reuses the * current fastmap anchor PEB. When this PEB gets erased * and a power cut happens before it is written again we * must make sure that the fastmap attach code doesn't * find any outdated fastmap anchors, hence we erase the * outdated fastmap anchor PEBs synchronously here. 
*/ if (aeb->vol_id == UBI_FM_SB_VOLUME_ID) sync = true; err = erase_aeb(ubi, aeb, sync); if (err) goto out_free; } found_pebs++; } dbg_wl("found %i PEBs", found_pebs); ubi_assert(ubi->good_peb_count == found_pebs); reserved_pebs = WL_RESERVED_PEBS; ubi_fastmap_init(ubi, &reserved_pebs); if (ubi->avail_pebs < reserved_pebs) { ubi_err(ubi, "no enough physical eraseblocks (%d, need %d)", ubi->avail_pebs, reserved_pebs); if (ubi->corr_peb_count) ubi_err(ubi, "%d PEBs are corrupted and not used", ubi->corr_peb_count); err = -ENOSPC; goto out_free; } ubi->avail_pebs -= reserved_pebs; ubi->rsvd_pebs += reserved_pebs; /* Schedule wear-leveling if needed */ err = ensure_wear_leveling(ubi, 0); if (err) goto out_free; #ifdef CONFIG_MTD_UBI_FASTMAP if (!ubi->ro_mode && !ubi->fm_disabled) ubi_ensure_anchor_pebs(ubi); #endif return 0; out_free: shutdown_work(ubi); tree_destroy(ubi, &ubi->used); tree_destroy(ubi, &ubi->free); tree_destroy(ubi, &ubi->scrub); kfree(ubi->lookuptbl); return err; } /** * protection_queue_destroy - destroy the protection queue. * @ubi: UBI device description object */ static void protection_queue_destroy(struct ubi_device *ubi) { int i; struct ubi_wl_entry *e, *tmp; for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i) { list_for_each_entry_safe(e, tmp, &ubi->pq[i], u.list) { list_del(&e->u.list); wl_entry_destroy(ubi, e); } } } /** * ubi_wl_close - close the wear-leveling sub-system. * @ubi: UBI device description object */ void ubi_wl_close(struct ubi_device *ubi) { dbg_wl("close the WL sub-system"); ubi_fastmap_close(ubi); shutdown_work(ubi); protection_queue_destroy(ubi); tree_destroy(ubi, &ubi->used); tree_destroy(ubi, &ubi->erroneous); tree_destroy(ubi, &ubi->free); tree_destroy(ubi, &ubi->scrub); kfree(ubi->lookuptbl); } /** * self_check_ec - make sure that the erase counter of a PEB is correct. * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * @ec: the erase counter to check * * This function returns zero if the erase counter of physical eraseblock @pnum * is equivalent to @ec, and a negative error code if not or if an error * occurred. */ static int self_check_ec(struct ubi_device *ubi, int pnum, int ec) { int err; long long read_ec; struct ubi_ec_hdr *ec_hdr; if (!ubi_dbg_chk_gen(ubi)) return 0; ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS); if (!ec_hdr) return -ENOMEM; err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0); if (err && err != UBI_IO_BITFLIPS) { /* The header does not have to exist */ err = 0; goto out_free; } read_ec = be64_to_cpu(ec_hdr->ec); if (ec != read_ec && read_ec - ec > 1) { ubi_err(ubi, "self-check failed for PEB %d", pnum); ubi_err(ubi, "read EC is %lld, should be %d", read_ec, ec); dump_stack(); err = 1; } else err = 0; out_free: kfree(ec_hdr); return err; } /** * self_check_in_wl_tree - check that wear-leveling entry is in WL RB-tree. * @ubi: UBI device description object * @e: the wear-leveling entry to check * @root: the root of the tree * * This function returns zero if @e is in the @root RB-tree and %-EINVAL if it * is not. */ static int self_check_in_wl_tree(const struct ubi_device *ubi, struct ubi_wl_entry *e, struct rb_root *root) { if (!ubi_dbg_chk_gen(ubi)) return 0; if (in_wl_tree(e, root)) return 0; ubi_err(ubi, "self-check failed for PEB %d, EC %d, RB-tree %p ", e->pnum, e->ec, root); dump_stack(); return -EINVAL; } /** * self_check_in_pq - check if wear-leveling entry is in the protection * queue. 
* @ubi: UBI device description object * @e: the wear-leveling entry to check * * This function returns zero if @e is in @ubi->pq and %-EINVAL if it is not. */ static int self_check_in_pq(const struct ubi_device *ubi, struct ubi_wl_entry *e) { if (!ubi_dbg_chk_gen(ubi)) return 0; if (in_pq(ubi, e)) return 0; ubi_err(ubi, "self-check failed for PEB %d, EC %d, Protect queue", e->pnum, e->ec); dump_stack(); return -EINVAL; } #ifndef CONFIG_MTD_UBI_FASTMAP static struct ubi_wl_entry *get_peb_for_wl(struct ubi_device *ubi) { struct ubi_wl_entry *e; e = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF, 0); self_check_in_wl_tree(ubi, e, &ubi->free); ubi->free_count--; ubi_assert(ubi->free_count >= 0); rb_erase(&e->u.rb, &ubi->free); return e; } /** * produce_free_peb - produce a free physical eraseblock. * @ubi: UBI device description object * * This function tries to make a free PEB by means of synchronous execution of * pending works. This may be needed if, for example the background thread is * disabled. Returns zero in case of success and a negative error code in case * of failure. */ static int produce_free_peb(struct ubi_device *ubi) { int err; while (!ubi->free.rb_node && ubi->works_count) { spin_unlock(&ubi->wl_lock); dbg_wl("do one work synchronously"); err = do_work(ubi, NULL); spin_lock(&ubi->wl_lock); if (err) return err; } return 0; } /** * ubi_wl_get_peb - get a physical eraseblock. * @ubi: UBI device description object * * This function returns a physical eraseblock in case of success and a * negative error code in case of failure. * Returns with ubi->fm_eba_sem held in read mode! */ int ubi_wl_get_peb(struct ubi_device *ubi) { int err; struct ubi_wl_entry *e; retry: down_read(&ubi->fm_eba_sem); spin_lock(&ubi->wl_lock); if (!ubi->free.rb_node) { if (ubi->works_count == 0) { ubi_err(ubi, "no free eraseblocks"); ubi_assert(list_empty(&ubi->works)); spin_unlock(&ubi->wl_lock); return -ENOSPC; } err = produce_free_peb(ubi); if (err < 0) { spin_unlock(&ubi->wl_lock); return err; } spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_eba_sem); goto retry; } e = wl_get_wle(ubi); prot_queue_add(ubi, e); spin_unlock(&ubi->wl_lock); err = ubi_self_check_all_ff(ubi, e->pnum, ubi->vid_hdr_aloffset, ubi->peb_size - ubi->vid_hdr_aloffset); if (err) { ubi_err(ubi, "new PEB %d does not contain all 0xFF bytes", e->pnum); return err; } return e->pnum; } #else #include "fastmap-wl.c" #endif
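/*
 * Editor's illustrative sketch, not kernel code: a tiny userspace model of
 * the wear-leveling trigger implemented by wl.c above. The RB-trees become
 * qsort()ed arrays ordered by the same (erase counter, PEB number) key as
 * wl_tree_add(), and the free-PEB pick mimics find_wl_entry()'s "highest EC
 * below min_ec + diff" rule as used by ensure_wear_leveling(). All
 * wl_model_* names, MODEL_* constants and the sample numbers are assumptions
 * of this sketch.
 */
#include <stdio.h>
#include <stdlib.h>

#define MODEL_WL_THRESHOLD	128	/* stand-in for UBI_WL_THRESHOLD */
#define MODEL_FREE_MAX_DIFF	(2 * MODEL_WL_THRESHOLD)

struct wl_model_entry { int pnum; int ec; };

/* Two-level key as in wl_tree_add(): erase counter, then PEB number */
static int wl_model_cmp(const void *a, const void *b)
{
	const struct wl_model_entry *e1 = a, *e2 = b;

	if (e1->ec != e2->ec)
		return e1->ec < e2->ec ? -1 : 1;
	return e1->pnum < e2->pnum ? -1 : 1;	/* PEB numbers are unique */
}

/* Highest-EC entry strictly below min_ec + @diff, like find_wl_entry() */
static const struct wl_model_entry *
wl_model_find(const struct wl_model_entry *sorted, int n, int diff)
{
	int max = sorted[0].ec + diff, i;
	const struct wl_model_entry *e = &sorted[0];

	for (i = 0; i < n && sorted[i].ec < max; i++)
		e = &sorted[i];
	return e;
}

int main(void)
{
	struct wl_model_entry used[] = { {7, 950}, {3, 1000}, {9, 980} };
	struct wl_model_entry avail[] = { {1, 1040}, {4, 1210}, {6, 1100} };
	const struct wl_model_entry *e1, *e2;

	qsort(used, 3, sizeof(used[0]), wl_model_cmp);
	qsort(avail, 3, sizeof(avail[0]), wl_model_cmp);

	e1 = &used[0];				/* least worn-out used PEB */
	e2 = wl_model_find(avail, 3, MODEL_FREE_MAX_DIFF);

	if (e2->ec - e1->ec >= MODEL_WL_THRESHOLD)
		printf("move PEB %d (EC %d) to PEB %d (EC %d)\n",
		       e1->pnum, e1->ec, e2->pnum, e2->ec);
	else
		printf("no WL needed: min used EC %d, max free EC %d\n",
		       e1->ec, e2->ec);
	return 0;
}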
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __VDSO_MATH64_H
#define __VDSO_MATH64_H

static __always_inline u32
__iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
	u32 ret = 0;

	while (dividend >= divisor) {
		/* The following asm() prevents the compiler from
		   optimising this loop into a modulo operation. */
		asm("" : "+rm"(dividend));

		dividend -= divisor;
		ret++;
	}

	*remainder = dividend;

	return ret;
}

#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)

#ifndef mul_u64_u32_add_u64_shr
static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
{
	return (u64)((((unsigned __int128)a * mul) + b) >> shift);
}
#endif /* mul_u64_u32_add_u64_shr */

#else

#ifndef mul_u64_u32_add_u64_shr
#ifndef mul_u32_u32
static inline u64 mul_u32_u32(u32 a, u32 b)
{
	return (u64)a * b;
}
#define mul_u32_u32 mul_u32_u32
#endif
static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
{
	u32 ah = a >> 32, al = a;
	bool ovf;
	u64 ret;

	ovf = __builtin_add_overflow(mul_u32_u32(al, mul), b, &ret);
	ret >>= shift;
	if (ovf && shift)
		ret += 1ULL << (64 - shift);
	if (ah)
		ret += mul_u32_u32(ah, mul) << (32 - shift);

	return ret;
}
#endif /* mul_u64_u32_add_u64_shr */

#endif

#endif /* __VDSO_MATH64_H */
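/*
 * Editor's userspace check (a sketch, not part of the header above): the
 * 32x32 fallback of mul_u64_u32_add_u64_shr() should agree with the __int128
 * fast path for ((a * mul) + b) >> shift when shift is in [1, 32]. The
 * typedefs, the copied *_portable helper and the sample operands are local
 * assumptions of this sketch; it needs a GCC/Clang target with __int128.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;
typedef uint64_t u64;

static u64 mul_u64_u32_add_u64_shr_portable(u64 a, u32 mul, u64 b,
					    unsigned int shift)
{
	u32 ah = a >> 32, al = a;
	u64 ret;
	bool ovf = __builtin_add_overflow((u64)al * mul, b, &ret);

	ret >>= shift;
	if (ovf && shift)
		ret += 1ULL << (64 - shift);	/* carry out of the low part */
	if (ah)
		ret += ((u64)ah * mul) << (32 - shift);
	return ret;
}

int main(void)
{
	u64 a = 0x1122334455667788ULL, b = 999;
	u32 mul = 1000000;
	unsigned int shift = 20;
	u64 wide = (u64)((((unsigned __int128)a * mul) + b) >> shift);
	u64 narrow = mul_u64_u32_add_u64_shr_portable(a, mul, b, shift);

	printf("wide=%llu narrow=%llu %s\n",
	       (unsigned long long)wide, (unsigned long long)narrow,
	       wide == narrow ? "OK" : "MISMATCH");
	return wide != narrow;
}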
/*
 * include/linux/ktime.h
 *
 * ktime_t - nanosecond-resolution time format.
 *
 * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
 *
 * data type definitions, declarations, prototypes and macros.
 *
 * Started by: Thomas Gleixner and Ingo Molnar
 *
 * Credits:
 *
 *	Roman Zippel provided the ideas and primary code snippets of
 *	the ktime_t union and further simplifications of the original
 *	code.
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_KTIME_H
#define _LINUX_KTIME_H

#include <asm/bug.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/types.h>

/**
 * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value
 * @secs:	seconds to set
 * @nsecs:	nanoseconds to set
 *
 * Return: The ktime_t representation of the value.
 */
static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs)
{
	if (unlikely(secs >= KTIME_SEC_MAX))
		return KTIME_MAX;

	return secs * NSEC_PER_SEC + (s64)nsecs;
}

/* Subtract two ktime_t variables. rem = lhs - rhs: */
#define ktime_sub(lhs, rhs)	((lhs) - (rhs))

/* Add two ktime_t variables. res = lhs + rhs: */
#define ktime_add(lhs, rhs)	((lhs) + (rhs))

/*
 * Same as ktime_add(), but avoids undefined behaviour on overflow; however,
 * this means that you must check the result for overflow yourself.
 */
#define ktime_add_unsafe(lhs, rhs)	((u64) (lhs) + (rhs))

/*
 * Add a ktime_t variable and a scalar nanosecond value.
 * res = kt + nsval:
 */
#define ktime_add_ns(kt, nsval)		((kt) + (nsval))

/*
 * Subtract a scalar nanosecond value from a ktime_t variable.
 * res = kt - nsval:
 */
#define ktime_sub_ns(kt, nsval)		((kt) - (nsval))

/* Convert a timespec64 to ktime_t format: */
static inline ktime_t timespec64_to_ktime(struct timespec64 ts)
{
	return ktime_set(ts.tv_sec, ts.tv_nsec);
}

/* Map the ktime_t to timespec conversion to ns_to_timespec function */
#define ktime_to_timespec64(kt)		ns_to_timespec64((kt))

/* Convert ktime_t to nanoseconds */
static inline s64 ktime_to_ns(const ktime_t kt)
{
	return kt;
}

/**
 * ktime_compare - Compares two ktime_t variables for less, greater or equal
 * @cmp1:	comparable1
 * @cmp2:	comparable2
 *
 * Return: ...
 *   cmp1  < cmp2: return <0
 *   cmp1 == cmp2: return 0
 *   cmp1  > cmp2: return >0
 */
static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2)
{
	if (cmp1 < cmp2)
		return -1;
	if (cmp1 > cmp2)
		return 1;
	return 0;
}
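/*
 * Editor's illustrative helper (an assumption, not a mainline API): shows how
 * the helpers above compose. A timeout test reduces to one ktime_compare()
 * against a deadline built with ktime_add_ns().
 */
static inline bool ktime_demo_expired(ktime_t now, ktime_t start, u64 timeout_ns)
{
	/* deadline = start + timeout_ns; expired once @now is past it */
	return ktime_compare(now, ktime_add_ns(start, timeout_ns)) > 0;
}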
/**
 * ktime_after - Compare if a ktime_t value is bigger than another one.
 * @cmp1:	comparable1
 * @cmp2:	comparable2
 *
 * Return: true if cmp1 happened after cmp2.
 */
static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2)
{
	return ktime_compare(cmp1, cmp2) > 0;
}

/**
 * ktime_before - Compare if a ktime_t value is smaller than another one.
 * @cmp1:	comparable1
 * @cmp2:	comparable2
 *
 * Return: true if cmp1 happened before cmp2.
 */
static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2)
{
	return ktime_compare(cmp1, cmp2) < 0;
}

#if BITS_PER_LONG < 64
extern s64 __ktime_divns(const ktime_t kt, s64 div);
static inline s64 ktime_divns(const ktime_t kt, s64 div)
{
	/*
	 * Negative divisors could cause an infinite loop,
	 * so bug out here.
	 */
	BUG_ON(div < 0);
	if (__builtin_constant_p(div) && !(div >> 32)) {
		s64 ns = kt;
		u64 tmp = ns < 0 ? -ns : ns;

		do_div(tmp, div);
		return ns < 0 ? -tmp : tmp;
	} else {
		return __ktime_divns(kt, div);
	}
}
#else /* BITS_PER_LONG < 64 */
static inline s64 ktime_divns(const ktime_t kt, s64 div)
{
	/*
	 * The 32-bit implementation cannot handle negative divisors,
	 * so catch them on 64-bit as well.
	 */
	WARN_ON(div < 0);
	return kt / div;
}
#endif

static inline s64 ktime_to_us(const ktime_t kt)
{
	return ktime_divns(kt, NSEC_PER_USEC);
}

static inline s64 ktime_to_ms(const ktime_t kt)
{
	return ktime_divns(kt, NSEC_PER_MSEC);
}

static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier)
{
	return ktime_to_us(ktime_sub(later, earlier));
}

static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier)
{
	return ktime_to_ms(ktime_sub(later, earlier));
}

static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec)
{
	return ktime_add_ns(kt, usec * NSEC_PER_USEC);
}

static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec)
{
	return ktime_add_ns(kt, msec * NSEC_PER_MSEC);
}

static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec)
{
	return ktime_sub_ns(kt, usec * NSEC_PER_USEC);
}

static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec)
{
	return ktime_sub_ns(kt, msec * NSEC_PER_MSEC);
}

extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs);

/**
 * ktime_to_timespec64_cond - convert a ktime_t variable to timespec64
 *			      format only if the variable contains data
 * @kt:		the ktime_t variable to convert
 * @ts:		the timespec variable to store the result in
 *
 * Return: %true if there was a successful conversion, %false if kt was 0.
 */
static inline __must_check bool ktime_to_timespec64_cond(const ktime_t kt,
							 struct timespec64 *ts)
{
	if (kt) {
		*ts = ktime_to_timespec64(kt);
		return true;
	} else {
		return false;
	}
}

#include <vdso/ktime.h>

static inline ktime_t ns_to_ktime(u64 ns)
{
	return ns;
}

static inline ktime_t us_to_ktime(u64 us)
{
	return us * NSEC_PER_USEC;
}

static inline ktime_t ms_to_ktime(u64 ms)
{
	return ms * NSEC_PER_MSEC;
}

# include <linux/timekeeping.h>

#endif
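/*
 * Editor's userspace model (a sketch, not kernel code): since ktime_t is
 * plain signed nanoseconds, the helpers above are ordinary integer
 * arithmetic. The *_model names and NSEC_* definitions are local to this
 * sketch; the printed delta matches what ktime_ms_delta() computes on a
 * 64-bit kernel.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC	1000000LL
#define NSEC_PER_SEC	1000000000LL

typedef int64_t ktime_model_t;

static ktime_model_t ktime_model_set(int64_t secs, unsigned long nsecs)
{
	return secs * NSEC_PER_SEC + (int64_t)nsecs;
}

static int64_t ktime_model_ms_delta(ktime_model_t later, ktime_model_t earlier)
{
	return (later - earlier) / NSEC_PER_MSEC;
}

int main(void)
{
	ktime_model_t start = ktime_model_set(1, 250000000);	/* 1.25 s */
	ktime_model_t end   = ktime_model_set(3, 0);		/* 3.00 s */

	/* Prints 1750 ms */
	printf("delta = %lld ms\n",
	       (long long)ktime_model_ms_delta(end, start));
	return 0;
}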
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2012 Google, Inc.
 *
 * Foreground allocator code: allocate buckets from freelist, and allocate in
 * sector granularity from writepoints.
 *
 * bch2_bucket_alloc() allocates a single bucket from a specific device.
 *
 * bch2_bucket_alloc_set() allocates one or more buckets from different devices
 * in a given filesystem.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_write.h"
#include "journal.h"
#include "movinggc.h"
#include "nocow_locking.h"
#include "trace.h"

#include <linux/math64.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
					   struct mutex *lock)
{
	if (!mutex_trylock(lock)) {
		bch2_trans_unlock(trans);
		mutex_lock(lock);
	}
}

const char * const bch2_watermarks[] = {
#define x(t) #t,
	BCH_WATERMARKS()
#undef x
	NULL
};

/*
 * Open buckets represent a bucket that's currently being allocated from. They
 * serve two purposes:
 *
 *  - They track buckets that have been partially allocated, allowing for
 *    sub-bucket sized allocations - they're used by the sector allocator below
 *
 *  - They provide a reference to the buckets they own that mark and sweep GC
 *    can find, until the new allocation has a pointer to it inserted into the
 *    btree
 *
 * When allocating some space with the sector allocator, the allocation comes
 * with a reference to an open bucket - the caller is required to put that
 * reference _after_ doing the index update that makes its allocation reachable.
 */
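/*
 * The lifetime rule above in practice (illustrative sketch, not from the
 * original file; error handling elided). A writer allocates, makes the
 * allocation reachable via a btree index update, and only then drops its
 * open_bucket reference:
 *
 *	struct open_bucket *ob = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal,
 *						    BCH_DATA_user, cl);
 *	if (!IS_ERR(ob)) {
 *		// ... write to the bucket, insert the extent into the btree ...
 *		bch2_open_bucket_put(c, ob);	// only after the index update
 *	}
 */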
void bch2_reset_alloc_cursors(struct bch_fs *c)
{
	rcu_read_lock();
	for_each_member_device_rcu(c, ca, NULL)
		memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
	rcu_read_unlock();
}

static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
{
	open_bucket_idx_t idx = ob - c->open_buckets;
	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);

	ob->hash = *slot;
	*slot = idx;
}

static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
{
	open_bucket_idx_t idx = ob - c->open_buckets;
	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);

	while (*slot != idx) {
		BUG_ON(!*slot);
		slot = &c->open_buckets[*slot].hash;
	}

	*slot = ob->hash;
	ob->hash = 0;
}

void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);

	if (ob->ec) {
		ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
		return;
	}

	spin_lock(&ob->lock);
	ob->valid = false;
	ob->data_type = 0;
	spin_unlock(&ob->lock);

	spin_lock(&c->freelist_lock);
	bch2_open_bucket_hash_remove(c, ob);

	ob->freelist = c->open_buckets_freelist;
	c->open_buckets_freelist = ob - c->open_buckets;

	c->open_buckets_nr_free++;
	ca->nr_open_buckets--;
	spin_unlock(&c->freelist_lock);

	closure_wake_up(&c->open_buckets_wait);
}

void bch2_open_bucket_write_error(struct bch_fs *c,
				  struct open_buckets *obs,
				  unsigned dev)
{
	struct open_bucket *ob;
	unsigned i;

	open_bucket_for_each(c, obs, ob, i)
		if (ob->dev == dev && ob->ec)
			bch2_ec_bucket_cancel(c, ob);
}

static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
{
	struct open_bucket *ob;

	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);

	ob = c->open_buckets + c->open_buckets_freelist;
	c->open_buckets_freelist = ob->freelist;
	atomic_set(&ob->pin, 1);
	ob->data_type = 0;

	c->open_buckets_nr_free--;
	return ob;
}

static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs)
		return false;

	return bch2_is_superblock_bucket(ca, b);
}

static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
{
	BUG_ON(c->open_buckets_partial_nr >=
	       ARRAY_SIZE(c->open_buckets_partial));

	spin_lock(&c->freelist_lock);
	rcu_read_lock();
	bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
	rcu_read_unlock();

	ob->on_partial_list = true;
	c->open_buckets_partial[c->open_buckets_partial_nr++] =
		ob - c->open_buckets;
	spin_unlock(&c->freelist_lock);

	closure_wake_up(&c->open_buckets_wait);
	closure_wake_up(&c->freelist_wait);
}

static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
	switch (watermark) {
	case BCH_WATERMARK_interior_updates:
		return 0;
	case BCH_WATERMARK_reclaim:
		return OPEN_BUCKETS_COUNT / 6;
	case BCH_WATERMARK_btree:
	case BCH_WATERMARK_btree_copygc:
		return OPEN_BUCKETS_COUNT / 4;
	case BCH_WATERMARK_copygc:
		return OPEN_BUCKETS_COUNT / 3;
	default:
		return OPEN_BUCKETS_COUNT / 2;
	}
}

static inline bool may_alloc_bucket(struct bch_fs *c,
				    struct bpos bucket,
				    struct bucket_alloc_state *s)
{
	if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
		s->skipped_open++;
		return false;
	}

	u64 journal_seq_ready =
		bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
					      bucket.inode, bucket.offset);
	if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
		if (journal_seq_ready > c->journal.flushing_seq)
			s->need_journal_commit++;
		s->skipped_need_journal_commit++;
		return false;
	}

	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
		s->skipped_nocow++;
		return false;
	}

	return true;
}
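/*
 * Worked example for open_buckets_reserved() (illustrative; assumes
 * OPEN_BUCKETS_COUNT is 1024, its value at the time of writing): a
 * BCH_WATERMARK_normal allocation fails once fewer than 512 open buckets
 * remain free, copygc once fewer than ~341 remain, btree updates below 256,
 * reclaim below ~170, and interior node updates may drain the table
 * completely. Higher-priority watermarks thus keep making progress after
 * ordinary writes have already started blocking.
 */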
static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
					      u64 bucket, u8 gen,
					      enum bch_watermark watermark,
					      struct bucket_alloc_state *s,
					      struct closure *cl)
{
	if (unlikely(is_superblock_bucket(c, ca, bucket)))
		return NULL;

	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
		s->skipped_nouse++;
		return NULL;
	}

	spin_lock(&c->freelist_lock);

	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
		if (cl)
			closure_wait(&c->open_buckets_wait, cl);

		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
		spin_unlock(&c->freelist_lock);
		return ERR_PTR(-BCH_ERR_open_buckets_empty);
	}

	/* Recheck under lock: */
	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		spin_unlock(&c->freelist_lock);
		s->skipped_open++;
		return NULL;
	}

	struct open_bucket *ob = bch2_open_bucket_alloc(c);

	spin_lock(&ob->lock);
	ob->valid	= true;
	ob->sectors_free = ca->mi.bucket_size;
	ob->dev		= ca->dev_idx;
	ob->gen		= gen;
	ob->bucket	= bucket;
	spin_unlock(&ob->lock);

	ca->nr_open_buckets++;
	bch2_open_bucket_hash_add(c, ob);

	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
	track_event_change(&c->times[BCH_TIME_blocked_allocate], false);

	spin_unlock(&c->freelist_lock);
	return ob;
}

static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
					    enum bch_watermark watermark,
					    struct bucket_alloc_state *s,
					    struct btree_iter *freespace_iter,
					    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);

	if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s))
		return NULL;

	u8 gen;
	int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret)
		return NULL;

	return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl);
}
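/*
 * Note on the return convention used by the two helpers above, and by the
 * scans below: NULL means "this candidate was rejected, keep scanning", an
 * ERR_PTR() means "stop - a real error or an exhausted reserve", and a
 * non-error pointer is a successfully allocated, pinned open_bucket. E.g.
 * (illustrative):
 *
 *	ob = try_alloc_bucket(trans, ca, watermark, s, iter, cl);
 *	if (!ob)
 *		continue;	// try the next bucket
 *	if (IS_ERR(ob))
 *		return ob;	// propagate the error
 *	// success
 */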
/*
 * This path is for before the freespace btree is initialized:
 */
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
			struct bch_dev *ca,
			enum bch_watermark watermark,
			struct bucket_alloc_state *s,
			struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, citer;
	struct bkey_s_c k, ck;
	struct open_bucket *ob = NULL;
	u64 first_bucket = ca->mi.first_bucket;
	u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
	u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
	u64 alloc_cursor = alloc_start;
	int ret;

	/*
	 * Scan with an uncached iterator to avoid polluting the key cache. An
	 * uncached iter will return a cached key if one exists, but if not
	 * there is no other underlying protection for the associated key cache
	 * slot. To avoid racing bucket allocations, look up the cached key slot
	 * of any likely allocation candidate before attempting to proceed with
	 * the allocation. This provides proper exclusion on the associated
	 * bucket.
	 */
again:
	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc,
				     POS(ca->dev_idx, alloc_cursor),
				     BTREE_ITER_slots, k, ret) {
		u64 bucket = k.k->p.offset;

		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
			break;

		if (s->btree_bitmap != BTREE_BITMAP_ANY &&
		    s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
				bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
			if (s->btree_bitmap == BTREE_BITMAP_YES &&
			    bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
				break;

			bucket = sector_to_bucket(ca,
					round_up(bucket_to_sector(ca, bucket) + 1,
						 1ULL << ca->mi.btree_bitmap_shift));
			bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
			s->buckets_seen++;
			s->skipped_mi_btree_bitmap++;
			continue;
		}

		struct bch_alloc_v4 a_convert;
		const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);

		if (a->data_type != BCH_DATA_free)
			continue;

		/* now check the cached key to serialize concurrent allocs of the bucket */
		ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
		ret = bkey_err(ck);
		if (ret)
			break;

		a = bch2_alloc_to_v4(ck, &a_convert);
		if (a->data_type != BCH_DATA_free)
			goto next;

		s->buckets_seen++;

		ob = may_alloc_bucket(c, k.k->p, s)
			? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, watermark, s, cl)
			: NULL;
next:
		bch2_set_btree_iter_dontneed(&citer);
		bch2_trans_iter_exit(trans, &citer);
		if (ob)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	alloc_cursor = iter.pos.offset;

	if (!ob && ret)
		ob = ERR_PTR(ret);

	if (!ob && alloc_start > first_bucket) {
		alloc_cursor = alloc_start = first_bucket;
		goto again;
	}

	*dev_alloc_cursor = alloc_cursor;

	return ob;
}

static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
						      struct bch_dev *ca,
						      enum bch_watermark watermark,
						      struct bucket_alloc_state *s,
						      struct closure *cl)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct open_bucket *ob = NULL;
	u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
	u64 alloc_cursor = alloc_start;
	int ret;
again:
	for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
					 POS(ca->dev_idx, alloc_cursor),
					 POS(ca->dev_idx, U64_MAX),
					 0, k, ret) {
		/*
		 * peek normally doesn't trim extents - they can span iter.pos,
		 * which is not what we want here:
		 */
		iter.k.size = iter.k.p.offset - iter.pos.offset;

		while (iter.k.size) {
			s->buckets_seen++;

			u64 bucket = iter.pos.offset & ~(~0ULL << 56);
			if (s->btree_bitmap != BTREE_BITMAP_ANY &&
			    s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
					bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
				if (s->btree_bitmap == BTREE_BITMAP_YES &&
				    bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
					goto fail;

				bucket = sector_to_bucket(ca,
						round_up(bucket_to_sector(ca, bucket + 1),
							 1ULL << ca->mi.btree_bitmap_shift));
				alloc_cursor = bucket | (iter.pos.offset & (~0ULL << 56));

				bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
				s->skipped_mi_btree_bitmap++;
				goto next;
			}

			ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl);
			if (ob) {
				if (!IS_ERR(ob))
					*dev_alloc_cursor = iter.pos.offset;
				bch2_set_btree_iter_dontneed(&iter);
				break;
			}

			iter.k.size--;
			iter.pos.offset++;
		}
next:
		if (ob || ret)
			break;
	}
fail:
	bch2_trans_iter_exit(trans, &iter);

	BUG_ON(ob && ret);

	if (ret)
		ob = ERR_PTR(ret);

	if (!ob && alloc_start > ca->mi.first_bucket) {
		alloc_cursor = alloc_start = ca->mi.first_bucket;
		goto again;
	}

	return ob;
}
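/*
 * Freespace key layout, as implied by the masks used above (sketch; the
 * actual encoding lives with the freespace btree helpers in
 * alloc_background.h): the low 56 bits of iter.pos.offset hold the bucket
 * number and the high bits carry generation bits, hence:
 *
 *	bucket  = pos.offset & ~(~0ULL << 56);	// low 56 bits
 *	genbits = pos.offset &  (~0ULL << 56);	// preserved when the
 *						// iterator is re-positioned
 */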
static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
					 enum bch_watermark watermark,
					 enum bch_data_type data_type,
					 struct closure *cl,
					 struct bch_dev_usage *usage,
					 struct bucket_alloc_state *s,
					 struct open_bucket *ob)
{
	struct printbuf buf = PRINTBUF;

	printbuf_tabstop_push(&buf, 24);

	prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx);
	prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
	prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
	prt_printf(&buf, "blocking\t%u\n", cl != NULL);
	prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
	prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
	prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
		   bch2_copygc_wait_amount(c),
		   c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
	prt_printf(&buf, "seen\t%llu\n", s->buckets_seen);
	prt_printf(&buf, "open\t%llu\n", s->skipped_open);
	prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
	prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow);
	prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse);
	prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);

	if (!IS_ERR(ob)) {
		prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
		trace_bucket_alloc(c, buf.buf);
	} else {
		prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
		trace_bucket_alloc_fail(c, buf.buf);
	}

	printbuf_exit(&buf);
}

/**
 * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
 * @trans:	transaction object
 * @ca:		device to allocate from
 * @watermark:	how important is this allocation?
 * @data_type:	BCH_DATA_journal, btree, user...
 * @cl:		if not NULL, closure to be used to wait if buckets not available
 * @nowait:	if true, do not wait for buckets to become available
 * @usage:	also used to return the current device usage
 *
 * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
 */
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
				      struct bch_dev *ca,
				      enum bch_watermark watermark,
				      enum bch_data_type data_type,
				      struct closure *cl,
				      bool nowait,
				      struct bch_dev_usage *usage)
{
	struct bch_fs *c = trans->c;
	struct open_bucket *ob = NULL;
	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
	u64 avail;
	struct bucket_alloc_state s = {
		.btree_bitmap = data_type == BCH_DATA_btree,
	};
	bool waiting = nowait;
again:
	bch2_dev_usage_read_fast(ca, usage);
	avail = dev_buckets_free(ca, *usage, watermark);

	if (usage->d[BCH_DATA_need_discard].buckets > avail)
		bch2_dev_do_discards(ca);

	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
		bch2_gc_gens_async(c);

	if (should_invalidate_buckets(ca, *usage))
		bch2_dev_do_invalidates(ca);

	if (!avail) {
		if (watermark > BCH_WATERMARK_normal &&
		    c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
			goto alloc;

		if (cl && !waiting) {
			closure_wait(&c->freelist_wait, cl);
			waiting = true;
			goto again;
		}

		track_event_change(&c->times[BCH_TIME_blocked_allocate], true);

		ob = ERR_PTR(-BCH_ERR_freelist_empty);
		goto err;
	}

	if (waiting)
		closure_wake_up(&c->freelist_wait);
alloc:
	ob = likely(freespace)
		? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
		: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);

	if (s.need_journal_commit * 2 > avail)
		bch2_journal_flush_async(&c->journal, NULL);

	if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
		s.btree_bitmap = BTREE_BITMAP_ANY;
		goto alloc;
	}

	if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
		freespace = false;
		goto alloc;
	}
err:
	if (!ob)
		ob = ERR_PTR(-BCH_ERR_no_buckets_found);

	if (!IS_ERR(ob))
		ob->data_type = data_type;

	if (!IS_ERR(ob))
		count_event(c, bucket_alloc);
	else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
		count_event(c, bucket_alloc_fail);

	if (!IS_ERR(ob)
	    ? trace_bucket_alloc_enabled()
	    : trace_bucket_alloc_fail_enabled())
		trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);

	return ob;
}

struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
				      enum bch_watermark watermark,
				      enum bch_data_type data_type,
				      struct closure *cl)
{
	struct bch_dev_usage usage;
	struct open_bucket *ob;

	bch2_trans_do(c,
		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
							data_type, cl, false, &usage)));
	return ob;
}

static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
			    unsigned l, unsigned r)
{
	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
		(stripe->next_alloc[l] < stripe->next_alloc[r]));
}

#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)

struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
					  struct dev_stripe_state *stripe,
					  struct bch_devs_mask *devs)
{
	struct dev_alloc_list ret = { .nr = 0 };
	unsigned i;

	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
		ret.data[ret.nr++] = i;

	bubble_sort(ret.data, ret.nr, dev_stripe_cmp);
	return ret;
}

static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
			       struct dev_stripe_state *stripe,
			       struct bch_dev_usage *usage)
{
	u64 *v = stripe->next_alloc + ca->dev_idx;
	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
	u64 free_space_inv = free_space
		? div64_u64(1ULL << 48, free_space)
		: 1ULL << 48;
	u64 scale = *v / 4;

	if (*v + free_space_inv >= *v)
		*v += free_space_inv;
	else
		*v = U64_MAX;

	for (v = stripe->next_alloc;
	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc);
	     v++)
		*v = *v < scale ? 0 : *v - scale;
}
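/*
 * Worked example for the stripe weighting above (illustrative numbers):
 * a device with 2^20 free buckets gets free_space_inv = 2^48 / 2^20 = 2^28
 * added to its next_alloc counter per allocation, while a device with 2^21
 * free buckets gets only 2^27. The fuller device's counter therefore grows
 * twice as fast, and since bch2_dev_alloc_list() sorts ascending by
 * next_alloc, the device with more free space is picked roughly twice as
 * often. The "*v / 4" rescale decays every counter so stale history fades.
 */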
void bch2_dev_stripe_increment(struct bch_dev *ca,
			       struct dev_stripe_state *stripe)
{
	struct bch_dev_usage usage;

	bch2_dev_usage_read_fast(ca, &usage);
	bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
}

static int add_new_bucket(struct bch_fs *c,
			  struct open_buckets *ptrs,
			  struct bch_devs_mask *devs_may_alloc,
			  unsigned nr_replicas,
			  unsigned *nr_effective,
			  bool *have_cache,
			  struct open_bucket *ob)
{
	unsigned durability = ob_dev(c, ob)->mi.durability;

	BUG_ON(*nr_effective >= nr_replicas);

	__clear_bit(ob->dev, devs_may_alloc->d);
	*nr_effective	+= durability;
	*have_cache	|= !durability;

	ob_push(c, ptrs, ob);

	if (*nr_effective >= nr_replicas)
		return 1;
	if (ob->ec)
		return 1;
	return 0;
}

int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
		      struct open_buckets *ptrs,
		      struct dev_stripe_state *stripe,
		      struct bch_devs_mask *devs_may_alloc,
		      unsigned nr_replicas,
		      unsigned *nr_effective,
		      bool *have_cache,
		      enum bch_write_flags flags,
		      enum bch_data_type data_type,
		      enum bch_watermark watermark,
		      struct closure *cl)
{
	struct bch_fs *c = trans->c;
	int ret = -BCH_ERR_insufficient_devices;

	BUG_ON(*nr_effective >= nr_replicas);

	struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc);
	darray_for_each(devs_sorted, i) {
		struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i);
		if (!ca)
			continue;

		if (!ca->mi.durability && *have_cache) {
			bch2_dev_put(ca);
			continue;
		}

		struct bch_dev_usage usage;
		struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
						     cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
		if (!IS_ERR(ob))
			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
		bch2_dev_put(ca);

		if (IS_ERR(ob)) {
			ret = PTR_ERR(ob);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
				break;
			continue;
		}

		if (add_new_bucket(c, ptrs, devs_may_alloc,
				   nr_replicas, nr_effective,
				   have_cache, ob)) {
			ret = 0;
			break;
		}
	}

	return ret;
}

/* Allocate from stripes: */

/*
 * if we can't allocate a new stripe because there are already too many
 * partially filled stripes, force allocating from an existing stripe even when
 * it's to a device we don't want:
 */
static int bucket_alloc_from_stripe(struct btree_trans *trans,
			 struct open_buckets *ptrs,
			 struct write_point *wp,
			 struct bch_devs_mask *devs_may_alloc,
			 u16 target,
			 unsigned nr_replicas,
			 unsigned *nr_effective,
			 bool *have_cache,
			 enum bch_watermark watermark,
			 enum bch_write_flags flags,
			 struct closure *cl)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	if (nr_replicas < 2)
		return 0;

	if (ec_open_bucket(c, ptrs))
		return 0;

	struct ec_stripe_head *h =
		bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
	if (IS_ERR(h))
		return PTR_ERR(h);
	if (!h)
		return 0;

	struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
	darray_for_each(devs_sorted, i)
		for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
			if (!h->s->blocks[ec_idx])
				continue;

			struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
			if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
				ob->ec_idx	= ec_idx;
				ob->ec		= h->s;
				ec_stripe_new_get(h->s, STRIPE_REF_io);

				ret = add_new_bucket(c, ptrs, devs_may_alloc,
						     nr_replicas, nr_effective,
						     have_cache, ob);
				goto out;
			}
		}
out:
	bch2_ec_stripe_head_put(c, h);
	return ret;
}

/* Sector allocator */

static bool want_bucket(struct bch_fs *c,
			struct write_point *wp,
			struct bch_devs_mask *devs_may_alloc,
			bool *have_cache, bool ec,
			struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);

	if (!test_bit(ob->dev, devs_may_alloc->d))
		return false;

	if (ob->data_type != wp->data_type)
		return false;

	if (!ca->mi.durability &&
	    (wp->data_type == BCH_DATA_btree || ec || *have_cache))
		return false;

	if (ec != (ob->ec != NULL))
		return false;

	return true;
}
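/*
 * Durability accounting in add_new_bucket(), by example (illustrative):
 * with nr_replicas = 2, a bucket on a durability-1 device brings
 * *nr_effective to 1, and a second durability-1 bucket completes the
 * allocation. A durability-0 (cache) bucket adds nothing to *nr_effective
 * but sets *have_cache, which stops further cache devices from being
 * considered. A single bucket on a durability-2 device satisfies both
 * replicas at once.
 */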
static int bucket_alloc_set_writepoint(struct bch_fs *c,
				       struct open_buckets *ptrs,
				       struct write_point *wp,
				       struct bch_devs_mask *devs_may_alloc,
				       unsigned nr_replicas,
				       unsigned *nr_effective,
				       bool *have_cache,
				       bool ec)
{
	struct open_buckets ptrs_skip = { .nr = 0 };
	struct open_bucket *ob;
	unsigned i;
	int ret = 0;

	open_bucket_for_each(c, &wp->ptrs, ob, i) {
		if (!ret && want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob))
			ret = add_new_bucket(c, ptrs, devs_may_alloc,
					     nr_replicas, nr_effective,
					     have_cache, ob);
		else
			ob_push(c, &ptrs_skip, ob);
	}
	wp->ptrs = ptrs_skip;

	return ret;
}

static int bucket_alloc_set_partial(struct bch_fs *c,
				    struct open_buckets *ptrs,
				    struct write_point *wp,
				    struct bch_devs_mask *devs_may_alloc,
				    unsigned nr_replicas,
				    unsigned *nr_effective,
				    bool *have_cache, bool ec,
				    enum bch_watermark watermark)
{
	int i, ret = 0;

	if (!c->open_buckets_partial_nr)
		return 0;

	spin_lock(&c->freelist_lock);

	if (!c->open_buckets_partial_nr)
		goto unlock;

	for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
		struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];

		if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
			struct bch_dev *ca = ob_dev(c, ob);
			struct bch_dev_usage usage;
			u64 avail;

			bch2_dev_usage_read_fast(ca, &usage);
			avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
			if (!avail)
				continue;

			array_remove_item(c->open_buckets_partial,
					  c->open_buckets_partial_nr,
					  i);
			ob->on_partial_list = false;

			rcu_read_lock();
			bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
			rcu_read_unlock();

			ret = add_new_bucket(c, ptrs, devs_may_alloc,
					     nr_replicas, nr_effective,
					     have_cache, ob);
			if (ret)
				break;
		}
	}
unlock:
	spin_unlock(&c->freelist_lock);
	return ret;
}

static int __open_bucket_add_buckets(struct btree_trans *trans,
			struct open_buckets *ptrs,
			struct write_point *wp,
			struct bch_devs_list *devs_have,
			u16 target,
			bool erasure_code,
			unsigned nr_replicas,
			unsigned *nr_effective,
			bool *have_cache,
			enum bch_watermark watermark,
			enum bch_write_flags flags,
			struct closure *_cl)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_mask devs;
	struct open_bucket *ob;
	struct closure *cl = NULL;
	unsigned i;
	int ret;

	devs = target_rw_devs(c, wp->data_type, target);

	/* Don't allocate from devices we already have pointers to: */
	darray_for_each(*devs_have, i)
		__clear_bit(*i, devs.d);

	open_bucket_for_each(c, ptrs, ob, i)
		__clear_bit(ob->dev, devs.d);

	ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
				 nr_replicas, nr_effective,
				 have_cache, erasure_code);
	if (ret)
		return ret;

	ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
				 nr_replicas, nr_effective,
				 have_cache, erasure_code, watermark);
	if (ret)
		return ret;

	if (erasure_code) {
		ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
					 target,
					 nr_replicas, nr_effective,
					 have_cache,
					 watermark, flags, _cl);
	} else {
retry_blocking:
		/*
		 * Try nonblocking first, so that if one device is full we'll try from
		 * other devices:
		 */
		ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
					nr_replicas, nr_effective, have_cache,
					flags, wp->data_type, watermark, cl);
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
		    !cl && _cl) {
			cl = _cl;
			goto retry_blocking;
		}
	}

	return ret;
}
static int open_bucket_add_buckets(struct btree_trans *trans,
			struct open_buckets *ptrs,
			struct write_point *wp,
			struct bch_devs_list *devs_have,
			u16 target,
			unsigned erasure_code,
			unsigned nr_replicas,
			unsigned *nr_effective,
			bool *have_cache,
			enum bch_watermark watermark,
			enum bch_write_flags flags,
			struct closure *cl)
{
	int ret;

	if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
		ret = __open_bucket_add_buckets(trans, ptrs, wp,
				devs_have, target, erasure_code,
				nr_replicas, nr_effective, have_cache,
				watermark, flags, cl);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
			return ret;
		if (*nr_effective >= nr_replicas)
			return 0;
	}

	ret = __open_bucket_add_buckets(trans, ptrs, wp,
			devs_have, target, false,
			nr_replicas, nr_effective, have_cache,
			watermark, flags, cl);
	return ret < 0 ? ret : 0;
}
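/*
 * Note the two-pass strategy above (a description of the code as written):
 * when erasure coding is requested, the first __open_bucket_add_buckets()
 * call tries to fill the allocation from a stripe; if that still leaves
 * *nr_effective short of nr_replicas, a second non-ec pass tops the
 * allocation up with plain replicated buckets rather than failing the
 * write.
 */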
/**
 * should_drop_bucket - check if this open_bucket should go away
 * @ob:		open_bucket to predicate on
 * @c:		filesystem handle
 * @ca:		if set, we're killing buckets for a particular device
 * @ec:		if true, we're shutting down erasure coding and killing all ec
 *		open_buckets
 * Returns:	true if we should kill this open_bucket
 *
 * We're killing open_buckets because we're shutting down a device, erasure
 * coding, or the entire filesystem - check if this open_bucket matches:
 */
static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
			       struct bch_dev *ca, bool ec)
{
	if (ec) {
		return ob->ec != NULL;
	} else if (ca) {
		bool drop = ob->dev == ca->dev_idx;
		struct open_bucket *ob2;
		unsigned i;

		if (!drop && ob->ec) {
			unsigned nr_blocks;

			mutex_lock(&ob->ec->lock);
			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;

			for (i = 0; i < nr_blocks; i++) {
				if (!ob->ec->blocks[i])
					continue;

				ob2 = c->open_buckets + ob->ec->blocks[i];
				drop |= ob2->dev == ca->dev_idx;
			}
			mutex_unlock(&ob->ec->lock);
		}

		return drop;
	} else {
		return true;
	}
}

static void bch2_writepoint_stop(struct bch_fs *c,
				 struct bch_dev *ca,
				 bool ec,
				 struct write_point *wp)
{
	struct open_buckets ptrs = { .nr = 0 };
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&wp->lock);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		if (should_drop_bucket(ob, c, ca, ec))
			bch2_open_bucket_put(c, ob);
		else
			ob_push(c, &ptrs, ob);
	wp->ptrs = ptrs;
	mutex_unlock(&wp->lock);
}

void bch2_open_buckets_stop(struct bch_fs *c,
			    struct bch_dev *ca,
			    bool ec)
{
	unsigned i;

	/* Next, close write points that point to this device... */
	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);

	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);

	mutex_lock(&c->btree_reserve_cache_lock);
	while (c->btree_reserve_cache_nr) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		bch2_open_buckets_put(c, &a->ob);
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

	spin_lock(&c->freelist_lock);
	i = 0;
	while (i < c->open_buckets_partial_nr) {
		struct open_bucket *ob =
			c->open_buckets + c->open_buckets_partial[i];

		if (should_drop_bucket(ob, c, ca, ec)) {
			--c->open_buckets_partial_nr;
			swap(c->open_buckets_partial[i],
			     c->open_buckets_partial[c->open_buckets_partial_nr]);

			ob->on_partial_list = false;

			rcu_read_lock();
			bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
			rcu_read_unlock();

			spin_unlock(&c->freelist_lock);
			bch2_open_bucket_put(c, ob);
			spin_lock(&c->freelist_lock);
		} else {
			i++;
		}
	}
	spin_unlock(&c->freelist_lock);

	bch2_ec_stop_dev(c, ca);
}

static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
						 unsigned long write_point)
{
	unsigned hash =
		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));

	return &c->write_points_hash[hash];
}

static struct write_point *__writepoint_find(struct hlist_head *head,
					     unsigned long write_point)
{
	struct write_point *wp;

	rcu_read_lock();
	hlist_for_each_entry_rcu(wp, head, node)
		if (wp->write_point == write_point)
			goto out;
	wp = NULL;
out:
	rcu_read_unlock();
	return wp;
}

static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
{
	u64 stranded = c->write_points_nr * c->bucket_size_max;
	u64 free = bch2_fs_usage_read_short(c).free;

	return stranded * factor > free;
}

static bool try_increase_writepoints(struct bch_fs *c)
{
	struct write_point *wp;

	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
	    too_many_writepoints(c, 32))
		return false;

	wp = c->write_points + c->write_points_nr++;
	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
	return true;
}

static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->write_points_hash_lock);
	if (c->write_points_nr < old_nr) {
		mutex_unlock(&c->write_points_hash_lock);
		return true;
	}

	if (c->write_points_nr == 1 ||
	    !too_many_writepoints(c, 8)) {
		mutex_unlock(&c->write_points_hash_lock);
		return false;
	}

	wp = c->write_points + --c->write_points_nr;

	hlist_del_rcu(&wp->node);
	mutex_unlock(&c->write_points_hash_lock);

	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		open_bucket_free_unused(c, ob);
	wp->ptrs.nr = 0;
	mutex_unlock(&wp->lock);
	return true;
}

static struct write_point *writepoint_find(struct btree_trans *trans,
					   unsigned long write_point)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp, *oldest;
	struct hlist_head *head;

	if (!(write_point & 1UL)) {
		wp = (struct write_point *) write_point;
		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
		return wp;
	}

	head = writepoint_hash(c, write_point);
restart_find:
	wp = __writepoint_find(head, write_point);
	if (wp) {
lock_wp:
		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
		if (wp->write_point == write_point)
			goto out;
		mutex_unlock(&wp->lock);
		goto restart_find;
	}
restart_find_oldest:
	oldest = NULL;
	for (wp = c->write_points;
	     wp < c->write_points + c->write_points_nr;
	     wp++)
		if (!oldest || time_before64(wp->last_used, oldest->last_used))
			oldest = wp;
	bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
	bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
	if (oldest >= c->write_points + c->write_points_nr ||
	    try_increase_writepoints(c)) {
		mutex_unlock(&c->write_points_hash_lock);
		mutex_unlock(&oldest->lock);
		goto restart_find_oldest;
	}

	wp = __writepoint_find(head, write_point);
	if (wp && wp != oldest) {
		mutex_unlock(&c->write_points_hash_lock);
		mutex_unlock(&oldest->lock);
		goto lock_wp;
	}

	wp = oldest;
	hlist_del_rcu(&wp->node);
	wp->write_point = write_point;
	hlist_add_head_rcu(&wp->node, head);
	mutex_unlock(&c->write_points_hash_lock);
out:
	wp->last_used = local_clock();
	return wp;
}

static noinline void
deallocate_extra_replicas(struct bch_fs *c,
			  struct open_buckets *ptrs,
			  struct open_buckets *ptrs_no_use,
			  unsigned extra_replicas)
{
	struct open_buckets ptrs2 = { 0 };
	struct open_bucket *ob;
	unsigned i;

	open_bucket_for_each(c, ptrs, ob, i) {
		unsigned d = ob_dev(c, ob)->mi.durability;

		if (d && d <= extra_replicas) {
			extra_replicas -= d;
			ob_push(c, ptrs_no_use, ob);
		} else {
			ob_push(c, &ptrs2, ob);
		}
	}

	*ptrs = ptrs2;
}
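/*
 * deallocate_extra_replicas() by example (illustrative): if nr_effective
 * ended up at 3 for an nr_replicas = 2 write because one bucket came from
 * a durability-2 device, a durability-1 bucket is moved to @ptrs_no_use
 * (and later freed back to the partial list below) so the write doesn't
 * consume more space than requested. Durability-0 cache buckets are never
 * counted as extra and are always kept.
 */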
/*
 * Get us an open_bucket we can allocate from, return with it locked:
 */
int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
			     unsigned target,
			     unsigned erasure_code,
			     struct write_point_specifier write_point,
			     struct bch_devs_list *devs_have,
			     unsigned nr_replicas,
			     unsigned nr_replicas_required,
			     enum bch_watermark watermark,
			     enum bch_write_flags flags,
			     struct closure *cl,
			     struct write_point **wp_ret)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct open_bucket *ob;
	struct open_buckets ptrs;
	unsigned nr_effective, write_points_nr;
	bool have_cache;
	int ret;
	int i;

	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
		erasure_code = false;

	BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
	ptrs.nr		= 0;
	nr_effective	= 0;
	write_points_nr = c->write_points_nr;
	have_cache	= false;

	*wp_ret = wp = writepoint_find(trans, write_point.v);

	ret = bch2_trans_relock(trans);
	if (ret)
		goto err;

	/* metadata may not allocate on cache devices: */
	if (wp->data_type != BCH_DATA_user)
		have_cache = true;

	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, NULL);
		if (!ret ||
		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto alloc_done;

		/* Don't retry from all devices if we're out of open buckets: */
		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
			int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
			if (!ret2 ||
			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
				ret = ret2;
				goto alloc_done;
			}
		}

		/*
		 * Only try to allocate cache (durability = 0 devices) from the
		 * specified target:
		 */
		have_cache = true;

		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      0, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
	} else {
		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
	}
alloc_done:
	BUG_ON(!ret && nr_effective < nr_replicas);

	if (erasure_code && !ec_open_bucket(c, &ptrs))
		pr_debug("failed to get ec bucket: ret %u", ret);

	if (ret == -BCH_ERR_insufficient_devices &&
	    nr_effective >= nr_replicas_required)
		ret = 0;

	if (ret)
		goto err;

	if (nr_effective > nr_replicas)
		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);

	/* Free buckets we didn't use: */
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		open_bucket_free_unused(c, ob);

	wp->ptrs = ptrs;

	wp->sectors_free = UINT_MAX;

	open_bucket_for_each(c, &wp->ptrs, ob, i)
		wp->sectors_free = min(wp->sectors_free, ob->sectors_free);

	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);

	return 0;
err:
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
			ob_push(c, &ptrs, ob);
		else
			open_bucket_free_unused(c, ob);
	wp->ptrs = ptrs;

	mutex_unlock(&wp->lock);

	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
	    try_decrease_writepoints(trans, write_points_nr))
		goto retry;

	if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
		ret = -BCH_ERR_bucket_alloc_blocked;

	if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
		ret = -BCH_ERR_bucket_alloc_blocked;

	return ret;
}

struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);

	return (struct bch_extent_ptr) {
		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
		.gen	= ob->gen,
		.dev	= ob->dev,
		.offset	= bucket_to_sector(ca, ob->bucket) +
			ca->mi.bucket_size -
			ob->sectors_free,
	};
}

/*
 * Append pointers to the space we just allocated to @k, and mark @sectors space
 * as allocated out of @ob
 */
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
				    struct bkey_i *k, unsigned sectors,
				    bool cached)
{
	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
}

void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
{
	bch2_alloc_sectors_done_inlined(c, wp);
}
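/*
 * Putting the sector allocator together (illustrative sketch; error and
 * transaction-restart handling elided). A writer grabs a locked write
 * point, stamps pointers into its extent, then releases the write point:
 *
 *	struct write_point *wp;
 *
 *	ret = bch2_alloc_sectors_start_trans(trans, target, false,
 *			writepoint_ptr(&c->btree_write_point), &devs_have,
 *			1, 1, BCH_WATERMARK_normal, 0, cl, &wp);
 *	sectors = min(sectors, wp->sectors_free);
 *	bch2_alloc_sectors_append_ptrs(c, wp, &k->k_i, sectors, false);
 *	bch2_alloc_sectors_done(c, wp);		// unlocks wp
 *
 * bch2_ob_ptr() computes each pointer's offset as the bucket's first sector
 * plus however much of the bucket has already been handed out
 * (bucket_size - sectors_free).
 */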
static inline void writepoint_init(struct write_point *wp,
				   enum bch_data_type type)
{
	mutex_init(&wp->lock);
	wp->data_type = type;

	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
	INIT_LIST_HEAD(&wp->writes);
	spin_lock_init(&wp->writes_lock);
}

void bch2_fs_allocator_foreground_init(struct bch_fs *c)
{
	struct open_bucket *ob;
	struct write_point *wp;

	mutex_init(&c->write_points_hash_lock);
	c->write_points_nr = ARRAY_SIZE(c->write_points);

	/* open bucket 0 is a sentinel NULL: */
	spin_lock_init(&c->open_buckets[0].lock);

	for (ob = c->open_buckets + 1;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
		spin_lock_init(&ob->lock);
		c->open_buckets_nr_free++;

		ob->freelist = c->open_buckets_freelist;
		c->open_buckets_freelist = ob - c->open_buckets;
	}

	writepoint_init(&c->btree_write_point,		BCH_DATA_btree);
	writepoint_init(&c->rebalance_write_point,	BCH_DATA_user);
	writepoint_init(&c->copygc_write_point,		BCH_DATA_user);

	for (wp = c->write_points;
	     wp < c->write_points + c->write_points_nr; wp++) {
		writepoint_init(wp, BCH_DATA_user);

		wp->last_used	= local_clock();
		wp->write_point	= (unsigned long) wp;
		hlist_add_head_rcu(&wp->node,
				   writepoint_hash(c, wp->write_point));
	}
}

void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);
	unsigned data_type = ob->data_type;
	barrier(); /* READ_ONCE() doesn't work on bitfields */

	prt_printf(out, "%zu ref %u ",
		   ob - c->open_buckets,
		   atomic_read(&ob->pin));
	bch2_prt_data_type(out, data_type);
	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
		   ob->dev, ob->bucket, ob->gen,
		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
	if (ob->ec)
		prt_printf(out, " ec idx %llu", ob->ec->idx);
	if (ob->on_partial_list)
		prt_str(out, " partial");
	prt_newline(out);
}

void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
			       struct bch_dev *ca)
{
	struct open_bucket *ob;

	out->atomic++;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && (!ca || ob->dev == ca->dev_idx))
			bch2_open_bucket_to_text(out, c, ob);
		spin_unlock(&ob->lock);
	}

	--out->atomic;
}

void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
{
	unsigned i;

	out->atomic++;
	spin_lock(&c->freelist_lock);

	for (i = 0; i < c->open_buckets_partial_nr; i++)
		bch2_open_bucket_to_text(out, c,
				c->open_buckets + c->open_buckets_partial[i]);

	spin_unlock(&c->freelist_lock);
	--out->atomic;
}

static const char * const bch2_write_point_states[] = {
#define x(n)	#n,
	WRITE_POINT_STATES()
#undef x
	NULL
};

static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
				     struct write_point *wp)
{
	struct open_bucket *ob;
	unsigned i;

	prt_printf(out, "%lu: ", wp->write_point);
	prt_human_readable_u64(out, wp->sectors_allocated);

	prt_printf(out, " last wrote: ");
	bch2_pr_time_units(out, sched_clock() - wp->last_used);

	for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
		prt_printf(out, " %s: ", bch2_write_point_states[i]);
		bch2_pr_time_units(out, wp->time[i]);
	}

	prt_newline(out);

	printbuf_indent_add(out, 2);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		bch2_open_bucket_to_text(out, c, ob);
	printbuf_indent_sub(out, 2);
}

void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct write_point *wp;

	prt_str(out, "Foreground write points\n");
	for (wp = c->write_points;
	     wp < c->write_points + ARRAY_SIZE(c->write_points);
	     wp++)
		bch2_write_point_to_text(out, c, wp);

	prt_str(out, "Copygc write point\n");
	bch2_write_point_to_text(out, c, &c->copygc_write_point);

	prt_str(out, "Rebalance write point\n");
	bch2_write_point_to_text(out, c, &c->rebalance_write_point);

	prt_str(out, "Btree write point\n");
	bch2_write_point_to_text(out, c, &c->btree_write_point);
}

void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
{
	unsigned nr[BCH_DATA_NR];

	memset(nr, 0, sizeof(nr));

	for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
		nr[c->open_buckets[i].data_type]++;

	printbuf_tabstops_reset(out);
	printbuf_tabstop_push(out, 24);

	prt_printf(out, "capacity\t%llu\n", c->capacity);
	prt_printf(out, "reserved\t%llu\n", c->reserved);
	prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
	prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
	prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
	prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
	prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
	prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
	prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));

	prt_newline(out);
	prt_printf(out, "freelist_wait\t%s\n",
		   c->freelist_wait.list.first ? "waiting" : "empty");
	prt_printf(out, "open buckets allocated\t%i\n",
		   OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
	prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
	prt_printf(out, "open_buckets_wait\t%s\n",
		   c->open_buckets_wait.list.first ? "waiting" : "empty");
"waiting" : "empty"); prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]); prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]); prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr); } void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; struct bch_dev_usage stats = bch2_dev_usage_read(ca); unsigned nr[BCH_DATA_NR]; memset(nr, 0, sizeof(nr)); for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; bch2_dev_usage_to_text(out, ca, &stats); prt_newline(out); prt_printf(out, "reserves:\n"); for (unsigned i = 0; i < BCH_WATERMARK_NR; i++) prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i)); prt_newline(out); printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 12); printbuf_tabstop_push(out, 16); prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); } static noinline void bch2_print_allocator_stuck(struct bch_fs *c) { struct printbuf buf = PRINTBUF; prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n", c->opts.allocator_stuck_timeout); prt_printf(&buf, "Allocator debug:\n"); printbuf_indent_add(&buf, 2); bch2_fs_alloc_debug_to_text(&buf, c); printbuf_indent_sub(&buf, 2); prt_newline(&buf); for_each_online_member(c, ca) { prt_printf(&buf, "Dev %u:\n", ca->dev_idx); printbuf_indent_add(&buf, 2); bch2_dev_alloc_debug_to_text(&buf, ca); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } prt_printf(&buf, "Copygc debug:\n"); printbuf_indent_add(&buf, 2); bch2_copygc_wait_to_text(&buf, c); printbuf_indent_sub(&buf, 2); prt_newline(&buf); prt_printf(&buf, "Journal debug:\n"); printbuf_indent_add(&buf, 2); bch2_journal_debug_to_text(&buf, &c->journal); printbuf_indent_sub(&buf, 2); bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } static inline unsigned allocator_wait_timeout(struct bch_fs *c) { if (c->allocator_last_stuck && time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies)) return 0; return c->opts.allocator_stuck_timeout * HZ; } void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) { unsigned t = allocator_wait_timeout(c); if (t && closure_sync_timeout(cl, t)) { c->allocator_last_stuck = jiffies; bch2_print_allocator_stuck(c); } closure_sync(cl); }
3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 
4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mac80211 configuration hooks for cfg80211
 *
 * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2015  Intel Mobile Communications GmbH
 * Copyright (C) 2015-2017 Intel Deutschland GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 */

#include <linux/ieee80211.h>
#include <linux/nl80211.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <linux/rcupdate.h>
#include <linux/fips.h>
#include <linux/if_ether.h>
#include <net/cfg80211.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
#include "rate.h"
#include "mesh.h"
#include "wme.h"

static struct ieee80211_link_data *
ieee80211_link_or_deflink(struct ieee80211_sub_if_data *sdata, int link_id,
			  bool require_valid)
{
	struct ieee80211_link_data *link;

	if (link_id < 0) {
		/*
		 * For keys, if sdata is not an MLD, we might not use
		 * the return value at all (if it's not a pairwise key),
		 * so in that case (require_valid==false) don't error.
		 */
		if (require_valid && ieee80211_vif_is_mld(&sdata->vif))
			return ERR_PTR(-EINVAL);

		return &sdata->deflink;
	}

	link = sdata_dereference(sdata->link[link_id], sdata);
	if (!link)
		return ERR_PTR(-ENOLINK);

	return link;
}

static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata,
					 struct vif_params *params)
{
	bool mu_mimo_groups = false;
	bool mu_mimo_follow = false;

	if (params->vht_mumimo_groups) {
		u64 membership;

		BUILD_BUG_ON(sizeof(membership) != WLAN_MEMBERSHIP_LEN);

		memcpy(sdata->vif.bss_conf.mu_group.membership,
		       params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN);
		memcpy(sdata->vif.bss_conf.mu_group.position,
		       params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN,
		       WLAN_USER_POSITION_LEN);
		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
						  BSS_CHANGED_MU_GROUPS);
		/* don't care about endianness - just check for 0 */
		memcpy(&membership, params->vht_mumimo_groups,
		       WLAN_MEMBERSHIP_LEN);
		mu_mimo_groups = membership != 0;
	}

	if (params->vht_mumimo_follow_addr) {
		mu_mimo_follow =
			is_valid_ether_addr(params->vht_mumimo_follow_addr);
		ether_addr_copy(sdata->u.mntr.mu_follow_addr,
				params->vht_mumimo_follow_addr);
	}

	sdata->vif.bss_conf.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow;
}
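/*
 * Monitor-mode option handling: validate the requested monitor flags and
 * MU-MIMO sniffing parameters before applying them. Flags that affect RX
 * filtering (cooked frames, active monitor) may not change while the
 * interface is running, since that would mean rebuilding all the fif_*
 * counters and the hardware filter configuration on the fly.
 */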
static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata,
				     struct vif_params *params)
{
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_sub_if_data *monitor_sdata;

	/* check flags first */
	if (params->flags && ieee80211_sdata_running(sdata)) {
		u32 mask = MONITOR_FLAG_COOK_FRAMES | MONITOR_FLAG_ACTIVE;

		/*
		 * Prohibit MONITOR_FLAG_COOK_FRAMES and
		 * MONITOR_FLAG_ACTIVE to be changed while the
		 * interface is up.
		 * Else we would need to add a lot of cruft
		 * to update everything:
		 *	cooked_mntrs, monitor and all fif_* counters
		 *	reconfigure hardware
		 */
		if ((params->flags & mask) != (sdata->u.mntr.flags & mask))
			return -EBUSY;
	}

	/* also validate MU-MIMO change */
	if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))
		monitor_sdata = sdata;
	else
		monitor_sdata = wiphy_dereference(local->hw.wiphy,
						  local->monitor_sdata);
	if (!monitor_sdata &&
	    (params->vht_mumimo_groups || params->vht_mumimo_follow_addr))
		return -EOPNOTSUPP;

	/* apply all changes now - no failures allowed */
	if (monitor_sdata &&
	    (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) ||
	     ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)))
		ieee80211_set_mu_mimo_follow(monitor_sdata, params);

	if (params->flags) {
		if (ieee80211_sdata_running(sdata)) {
			ieee80211_adjust_monitor_flags(sdata, -1);
			sdata->u.mntr.flags = params->flags;
			ieee80211_adjust_monitor_flags(sdata, 1);

			ieee80211_configure_filter(local);
		} else {
			/*
			 * Because the interface is down, ieee80211_do_stop
			 * and ieee80211_do_open take care of "everything"
			 * mentioned in the comment above.
			 */
			sdata->u.mntr.flags = params->flags;
		}
	}

	return 0;
}

static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata,
					   struct cfg80211_mbssid_config *params,
					   struct ieee80211_bss_conf *link_conf)
{
	struct ieee80211_sub_if_data *tx_sdata;

	sdata->vif.mbssid_tx_vif = NULL;
	link_conf->bssid_index = 0;
	link_conf->nontransmitted = false;
	link_conf->ema_ap = false;
	link_conf->bssid_indicator = 0;

	if (sdata->vif.type != NL80211_IFTYPE_AP || !params->tx_wdev)
		return -EINVAL;

	tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params->tx_wdev);
	if (!tx_sdata)
		return -EINVAL;

	if (tx_sdata == sdata) {
		sdata->vif.mbssid_tx_vif = &sdata->vif;
	} else {
		sdata->vif.mbssid_tx_vif = &tx_sdata->vif;
		link_conf->nontransmitted = true;
		link_conf->bssid_index = params->index;
	}

	if (params->ema)
		link_conf->ema_ap = true;

	return 0;
}
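/*
 * Interface add/remove/change back the corresponding nl80211 virtual
 * interface operations. As a rough example of how this is reached from
 * userspace: something like "iw phy phy0 interface add mon0 type monitor"
 * goes through cfg80211 and ends up in ieee80211_add_iface() below, with
 * the monitor flags carried in @params.
 */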
static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy,
						const char *name,
						unsigned char name_assign_type,
						enum nl80211_iftype type,
						struct vif_params *params)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);
	struct wireless_dev *wdev;
	struct ieee80211_sub_if_data *sdata;
	int err;

	err = ieee80211_if_add(local, name, name_assign_type, &wdev, type, params);
	if (err)
		return ERR_PTR(err);

	sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);

	if (type == NL80211_IFTYPE_MONITOR) {
		err = ieee80211_set_mon_options(sdata, params);
		if (err) {
			ieee80211_if_remove(sdata);
			/* callers expect an ERR_PTR on failure, not NULL */
			return ERR_PTR(err);
		}
	}

	/* Let the driver know that an interface is going to be added.
	 * Indicate so only for interface types that will be added to the
	 * driver.
	 */
	switch (type) {
	case NL80211_IFTYPE_AP_VLAN:
		break;
	case NL80211_IFTYPE_MONITOR:
		if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) ||
		    !(params->flags & MONITOR_FLAG_ACTIVE))
			break;
		fallthrough;
	default:
		drv_prep_add_interface(local,
				       ieee80211_vif_type_p2p(&sdata->vif));
		break;
	}

	return wdev;
}

static int ieee80211_del_iface(struct wiphy *wiphy, struct wireless_dev *wdev)
{
	ieee80211_if_remove(IEEE80211_WDEV_TO_SUB_IF(wdev));

	return 0;
}

static int ieee80211_change_iface(struct wiphy *wiphy,
				  struct net_device *dev,
				  enum nl80211_iftype type,
				  struct vif_params *params)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = sdata->local;
	struct sta_info *sta;
	int ret;

	lockdep_assert_wiphy(local->hw.wiphy);

	ret = ieee80211_if_change_type(sdata, type);
	if (ret)
		return ret;

	if (type == NL80211_IFTYPE_AP_VLAN && params->use_4addr == 0) {
		RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);
		ieee80211_check_fast_rx_iface(sdata);
	} else if (type == NL80211_IFTYPE_STATION && params->use_4addr >= 0) {
		struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;

		if (params->use_4addr == ifmgd->use_4addr)
			return 0;

		/* FIXME: no support for 4-addr MLO yet */
		if (ieee80211_vif_is_mld(&sdata->vif))
			return -EOPNOTSUPP;

		sdata->u.mgd.use_4addr = params->use_4addr;
		if (!ifmgd->associated)
			return 0;

		sta = sta_info_get(sdata, sdata->deflink.u.mgd.bssid);
		if (sta)
			drv_sta_set_4addr(local, sdata, &sta->sta,
					  params->use_4addr);

		if (params->use_4addr)
			ieee80211_send_4addr_nullfunc(local, sdata);
	}

	if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
		ret = ieee80211_set_mon_options(sdata, params);
		if (ret)
			return ret;
	}

	return 0;
}

static int ieee80211_start_p2p_device(struct wiphy *wiphy,
				      struct wireless_dev *wdev)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
	int ret;

	lockdep_assert_wiphy(sdata->local->hw.wiphy);

	ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1);
	if (ret < 0)
		return ret;

	return ieee80211_do_open(wdev, true);
}

static void ieee80211_stop_p2p_device(struct wiphy *wiphy,
				      struct wireless_dev *wdev)
{
	ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev));
}

static int ieee80211_start_nan(struct wiphy *wiphy,
			       struct wireless_dev *wdev,
			       struct cfg80211_nan_conf *conf)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
	int ret;

	lockdep_assert_wiphy(sdata->local->hw.wiphy);

	ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1);
	if (ret < 0)
		return ret;

	ret = ieee80211_do_open(wdev, true);
	if (ret)
		return ret;

	ret = drv_start_nan(sdata->local, sdata, conf);
	if (ret)
		ieee80211_sdata_stop(sdata);

	sdata->u.nan.conf = *conf;

	return ret;
}

static void ieee80211_stop_nan(struct wiphy *wiphy,
			       struct wireless_dev *wdev)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);

	drv_stop_nan(sdata->local, sdata);
	ieee80211_sdata_stop(sdata);
}

static int ieee80211_nan_change_conf(struct wiphy *wiphy,
				     struct wireless_dev *wdev,
				     struct cfg80211_nan_conf *conf,
				     u32 changes)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
	struct cfg80211_nan_conf new_conf;
	int ret = 0;

	if (sdata->vif.type != NL80211_IFTYPE_NAN)
		return -EOPNOTSUPP;

	if (!ieee80211_sdata_running(sdata))
		return -ENETDOWN;

	new_conf = sdata->u.nan.conf;

	if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
		new_conf.master_pref = conf->master_pref;

	if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
		new_conf.bands = conf->bands;

	ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes);
	if (!ret)
		sdata->u.nan.conf = new_conf;

	return ret;
}
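/*
 * NAN function bookkeeping: instance IDs for NAN discovery functions are
 * allocated from an IDR under func_lock and then handed to the driver.
 * Roughly, via nl80211: NL80211_CMD_ADD_NAN_FUNCTION lands in
 * ieee80211_add_nan_func() below, and removal by cookie goes through
 * ieee80211_del_nan_func().
 */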
static int ieee80211_add_nan_func(struct wiphy *wiphy,
				  struct wireless_dev *wdev,
				  struct cfg80211_nan_func *nan_func)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
	int ret;

	if (sdata->vif.type != NL80211_IFTYPE_NAN)
		return -EOPNOTSUPP;

	if (!ieee80211_sdata_running(sdata))
		return -ENETDOWN;

	spin_lock_bh(&sdata->u.nan.func_lock);

	ret = idr_alloc(&sdata->u.nan.function_inst_ids,
			nan_func, 1, sdata->local->hw.max_nan_de_entries + 1,
			GFP_ATOMIC);
	spin_unlock_bh(&sdata->u.nan.func_lock);

	if (ret < 0)
		return ret;

	nan_func->instance_id = ret;

	WARN_ON(nan_func->instance_id == 0);

	ret = drv_add_nan_func(sdata->local, sdata, nan_func);
	if (ret) {
		spin_lock_bh(&sdata->u.nan.func_lock);
		idr_remove(&sdata->u.nan.function_inst_ids,
			   nan_func->instance_id);
		spin_unlock_bh(&sdata->u.nan.func_lock);
	}

	return ret;
}

static struct cfg80211_nan_func *
ieee80211_find_nan_func_by_cookie(struct ieee80211_sub_if_data *sdata,
				  u64 cookie)
{
	struct cfg80211_nan_func *func;
	int id;

	lockdep_assert_held(&sdata->u.nan.func_lock);

	idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) {
		if (func->cookie == cookie)
			return func;
	}

	return NULL;
}

static void ieee80211_del_nan_func(struct wiphy *wiphy,
				   struct wireless_dev *wdev, u64 cookie)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
	struct cfg80211_nan_func *func;
	u8 instance_id = 0;

	if (sdata->vif.type != NL80211_IFTYPE_NAN ||
	    !ieee80211_sdata_running(sdata))
		return;

	spin_lock_bh(&sdata->u.nan.func_lock);

	func = ieee80211_find_nan_func_by_cookie(sdata, cookie);
	if (func)
		instance_id = func->instance_id;

	spin_unlock_bh(&sdata->u.nan.func_lock);

	if (instance_id)
		drv_del_nan_func(sdata->local, sdata, instance_id);
}

static int ieee80211_set_noack_map(struct wiphy *wiphy,
				   struct net_device *dev,
				   u16 noack_map)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	sdata->noack_map = noack_map;

	ieee80211_check_fast_xmit_iface(sdata);

	return 0;
}

static int ieee80211_set_tx(struct ieee80211_sub_if_data *sdata,
			    const u8 *mac_addr, u8 key_idx)
{
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_key *key;
	struct sta_info *sta;
	int ret = -EINVAL;

	if (!wiphy_ext_feature_isset(local->hw.wiphy,
				     NL80211_EXT_FEATURE_EXT_KEY_ID))
		return -EINVAL;

	sta = sta_info_get_bss(sdata, mac_addr);
	if (!sta)
		return -EINVAL;

	if (sta->ptk_idx == key_idx)
		return 0;

	key = wiphy_dereference(local->hw.wiphy, sta->ptk[key_idx]);

	if (key && key->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)
		ret = ieee80211_set_tx_key(key);

	return ret;
}
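/*
 * Key installation: pairwise keys always hang off a station and span all
 * links (hence conf.link_id == -1 below), while group keys belong to one
 * link. A rough sketch of the Extended Key ID flow, assuming a supplicant
 * that uses it: the new PTK is first installed RX-only with
 * NL80211_KEY_NO_TX, and once the peer has switched, a second request
 * with NL80211_KEY_SET_TX makes ieee80211_set_tx() above activate it
 * for TX.
 */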
static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
			     int link_id, u8 key_idx, bool pairwise,
			     const u8 *mac_addr, struct key_params *params)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_link_data *link =
		ieee80211_link_or_deflink(sdata, link_id, false);
	struct ieee80211_local *local = sdata->local;
	struct sta_info *sta = NULL;
	struct ieee80211_key *key;
	int err;

	lockdep_assert_wiphy(local->hw.wiphy);

	if (!ieee80211_sdata_running(sdata))
		return -ENETDOWN;

	if (IS_ERR(link))
		return PTR_ERR(link);

	if (WARN_ON(pairwise && link_id >= 0))
		return -EINVAL;

	if (pairwise && params->mode == NL80211_KEY_SET_TX)
		return ieee80211_set_tx(sdata, mac_addr, key_idx);

	/* reject WEP and TKIP keys if WEP failed to initialize */
	switch (params->cipher) {
	case WLAN_CIPHER_SUITE_WEP40:
	case WLAN_CIPHER_SUITE_TKIP:
	case WLAN_CIPHER_SUITE_WEP104:
		if (link_id >= 0)
			return -EINVAL;
		if (WARN_ON_ONCE(fips_enabled))
			return -EINVAL;
		break;
	default:
		break;
	}

	key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len,
				  params->key, params->seq_len, params->seq);
	if (IS_ERR(key))
		return PTR_ERR(key);

	if (pairwise) {
		key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
		key->conf.link_id = -1;
	} else {
		key->conf.link_id = link->link_id;
	}

	if (params->mode == NL80211_KEY_NO_TX)
		key->conf.flags |= IEEE80211_KEY_FLAG_NO_AUTO_TX;

	if (mac_addr) {
		sta = sta_info_get_bss(sdata, mac_addr);
		/*
		 * The ASSOC test makes sure the driver is ready to
		 * receive the key. When wpa_supplicant has roamed
		 * using FT, it attempts to set the key before
		 * association has completed, this rejects that attempt
		 * so it will set the key again after association.
		 *
		 * TODO: accept the key if we have a station entry and
		 *       add it to the device after the station.
		 */
		if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) {
			ieee80211_key_free_unused(key);
			return -ENOENT;
		}
	}

	switch (sdata->vif.type) {
	case NL80211_IFTYPE_STATION:
		if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED)
			key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
		break;
	case NL80211_IFTYPE_AP:
	case NL80211_IFTYPE_AP_VLAN:
		/* Keys without a station are used for TX only */
		if (sta && test_sta_flag(sta, WLAN_STA_MFP))
			key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
		break;
	case NL80211_IFTYPE_ADHOC:
		/* no MFP (yet) */
		break;
	case NL80211_IFTYPE_MESH_POINT:
#ifdef CONFIG_MAC80211_MESH
		if (sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE)
			key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
		break;
#endif
	case NL80211_IFTYPE_WDS:
	case NL80211_IFTYPE_MONITOR:
	case NL80211_IFTYPE_P2P_DEVICE:
	case NL80211_IFTYPE_NAN:
	case NL80211_IFTYPE_UNSPECIFIED:
	case NUM_NL80211_IFTYPES:
	case NL80211_IFTYPE_P2P_CLIENT:
	case NL80211_IFTYPE_P2P_GO:
	case NL80211_IFTYPE_OCB:
		/* shouldn't happen */
		WARN_ON_ONCE(1);
		break;
	}

	err = ieee80211_key_link(key, link, sta);
	/* KRACK protection, shouldn't happen but just silently accept key */
	if (err == -EALREADY)
		err = 0;

	return err;
}

static struct ieee80211_key *
ieee80211_lookup_key(struct ieee80211_sub_if_data *sdata, int link_id,
		     u8 key_idx, bool pairwise, const u8 *mac_addr)
{
	struct ieee80211_local *local __maybe_unused = sdata->local;
	struct ieee80211_link_data *link = &sdata->deflink;
	struct ieee80211_key *key;

	if (link_id >= 0) {
		link = sdata_dereference(sdata->link[link_id], sdata);
		if (!link)
			return NULL;
	}

	if (mac_addr) {
		struct sta_info *sta;
		struct link_sta_info *link_sta;

		sta = sta_info_get_bss(sdata, mac_addr);
		if (!sta)
			return NULL;

		if (link_id >= 0) {
			link_sta = rcu_dereference_check(sta->link[link_id],
							 lockdep_is_held(&local->hw.wiphy->mtx));
			if (!link_sta)
				return NULL;
		} else {
			link_sta = &sta->deflink;
		}

		if (pairwise && key_idx < NUM_DEFAULT_KEYS)
			return wiphy_dereference(local->hw.wiphy,
						 sta->ptk[key_idx]);

		if (!pairwise &&
		    key_idx < NUM_DEFAULT_KEYS +
			      NUM_DEFAULT_MGMT_KEYS +
			      NUM_DEFAULT_BEACON_KEYS)
			return wiphy_dereference(local->hw.wiphy,
						 link_sta->gtk[key_idx]);

		return NULL;
	}

	if (pairwise && key_idx < NUM_DEFAULT_KEYS)
		return wiphy_dereference(local->hw.wiphy,
					 sdata->keys[key_idx]);

	key = wiphy_dereference(local->hw.wiphy, link->gtk[key_idx]);
	if (key)
		return key;

	/* or maybe it was a WEP key */
	if (key_idx < NUM_DEFAULT_KEYS)
		return wiphy_dereference(local->hw.wiphy, sdata->keys[key_idx]);

	return NULL;
}
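/*
 * ieee80211_lookup_key() above is shared by the del_key/get_key paths
 * below. The lookup order mirrors how keys are stored: with a station,
 * its pairwise PTKs are tried first, then the per-link group keys;
 * without one, the interface's pairwise slots, then the link's group
 * keys, and finally the WEP fallback kept in sdata->keys.
 */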
static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
			     int link_id, u8 key_idx, bool pairwise,
			     const u8 *mac_addr)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_key *key;

	lockdep_assert_wiphy(local->hw.wiphy);

	key = ieee80211_lookup_key(sdata, link_id, key_idx, pairwise, mac_addr);
	if (!key)
		return -ENOENT;

	ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION);

	return 0;
}

static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
			     int link_id, u8 key_idx, bool pairwise,
			     const u8 *mac_addr, void *cookie,
			     void (*callback)(void *cookie,
					      struct key_params *params))
{
	struct ieee80211_sub_if_data *sdata;
	u8 seq[6] = {0};
	struct key_params params;
	struct ieee80211_key *key;
	u64 pn64;
	u32 iv32;
	u16 iv16;
	int err = -ENOENT;
	struct ieee80211_key_seq kseq = {};

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	rcu_read_lock();

	key = ieee80211_lookup_key(sdata, link_id, key_idx, pairwise, mac_addr);
	if (!key)
		goto out;

	memset(&params, 0, sizeof(params));

	params.cipher = key->conf.cipher;

	switch (key->conf.cipher) {
	case WLAN_CIPHER_SUITE_TKIP:
		pn64 = atomic64_read(&key->conf.tx_pn);
		iv32 = TKIP_PN_TO_IV32(pn64);
		iv16 = TKIP_PN_TO_IV16(pn64);

		if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
		    !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) {
			drv_get_key_seq(sdata->local, key, &kseq);
			iv32 = kseq.tkip.iv32;
			iv16 = kseq.tkip.iv16;
		}

		seq[0] = iv16 & 0xff;
		seq[1] = (iv16 >> 8) & 0xff;
		seq[2] = iv32 & 0xff;
		seq[3] = (iv32 >> 8) & 0xff;
		seq[4] = (iv32 >> 16) & 0xff;
		seq[5] = (iv32 >> 24) & 0xff;
		params.seq = seq;
		params.seq_len = 6;
		break;
	case WLAN_CIPHER_SUITE_CCMP:
	case WLAN_CIPHER_SUITE_CCMP_256:
	case WLAN_CIPHER_SUITE_AES_CMAC:
	case WLAN_CIPHER_SUITE_BIP_CMAC_256:
		BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
			     offsetof(typeof(kseq), aes_cmac));
		fallthrough;
	case WLAN_CIPHER_SUITE_BIP_GMAC_128:
	case WLAN_CIPHER_SUITE_BIP_GMAC_256:
		BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
			     offsetof(typeof(kseq), aes_gmac));
		fallthrough;
	case WLAN_CIPHER_SUITE_GCMP:
	case WLAN_CIPHER_SUITE_GCMP_256:
		BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
			     offsetof(typeof(kseq), gcmp));

		if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
		    !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) {
			drv_get_key_seq(sdata->local, key, &kseq);
			memcpy(seq, kseq.ccmp.pn, 6);
		} else {
			pn64 = atomic64_read(&key->conf.tx_pn);
			seq[0] = pn64;
			seq[1] = pn64 >> 8;
			seq[2] = pn64 >> 16;
			seq[3] = pn64 >> 24;
			seq[4] = pn64 >> 32;
			seq[5] = pn64 >> 40;
		}
		params.seq = seq;
		params.seq_len = 6;
		break;
	default:
		if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
			break;
		if (WARN_ON(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV))
			break;
		drv_get_key_seq(sdata->local, key, &kseq);
		params.seq = kseq.hw.seq;
		params.seq_len = kseq.hw.seq_len;
		break;
	}

	callback(cookie, &params);
	err = 0;

 out:
	rcu_read_unlock();
	return err;
}

static int ieee80211_config_default_key(struct wiphy *wiphy,
					struct net_device *dev,
					int link_id, u8 key_idx, bool uni,
					bool multi)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_link_data *link =
		ieee80211_link_or_deflink(sdata, link_id, false);

	if (IS_ERR(link))
		return PTR_ERR(link);

	ieee80211_set_default_key(link, key_idx, uni, multi);

	return 0;
}

static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy,
					     struct net_device *dev,
					     int link_id, u8 key_idx)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_link_data *link =
		ieee80211_link_or_deflink(sdata, link_id, true);

	if (IS_ERR(link))
		return PTR_ERR(link);

	ieee80211_set_default_mgmt_key(link, key_idx);

	return 0;
}
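/*
 * Default key selection: nl80211 distinguishes the default
 * unicast/multicast key, the default management key and, below, the
 * default beacon protection key; each is tracked per link.
 */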
static int ieee80211_config_default_beacon_key(struct wiphy *wiphy,
					       struct net_device *dev,
					       int link_id, u8 key_idx)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_link_data *link =
		ieee80211_link_or_deflink(sdata, link_id, true);

	if (IS_ERR(link))
		return PTR_ERR(link);

	ieee80211_set_default_beacon_key(link, key_idx);

	return 0;
}

void sta_set_rate_info_tx(struct sta_info *sta,
			  const struct ieee80211_tx_rate *rate,
			  struct rate_info *rinfo)
{
	rinfo->flags = 0;
	if (rate->flags & IEEE80211_TX_RC_MCS) {
		rinfo->flags |= RATE_INFO_FLAGS_MCS;
		rinfo->mcs = rate->idx;
	} else if (rate->flags & IEEE80211_TX_RC_VHT_MCS) {
		rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS;
		rinfo->mcs = ieee80211_rate_get_vht_mcs(rate);
		rinfo->nss = ieee80211_rate_get_vht_nss(rate);
	} else {
		struct ieee80211_supported_band *sband;

		sband = ieee80211_get_sband(sta->sdata);
		WARN_ON_ONCE(sband && !sband->bitrates);
		if (sband && sband->bitrates)
			rinfo->legacy = sband->bitrates[rate->idx].bitrate;
	}
	if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
		rinfo->bw = RATE_INFO_BW_40;
	else if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
		rinfo->bw = RATE_INFO_BW_80;
	else if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH)
		rinfo->bw = RATE_INFO_BW_160;
	else
		rinfo->bw = RATE_INFO_BW_20;
	if (rate->flags & IEEE80211_TX_RC_SHORT_GI)
		rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
}

static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev,
				  int idx, u8 *mac, struct station_info *sinfo)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = sdata->local;
	struct sta_info *sta;
	int ret = -ENOENT;

	lockdep_assert_wiphy(local->hw.wiphy);

	sta = sta_info_get_by_idx(sdata, idx);
	if (sta) {
		ret = 0;
		memcpy(mac, sta->sta.addr, ETH_ALEN);
		sta_set_sinfo(sta, sinfo, true);
	}

	return ret;
}

static int ieee80211_dump_survey(struct wiphy *wiphy, struct net_device *dev,
				 int idx, struct survey_info *survey)
{
	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);

	return drv_get_survey(local, idx, survey);
}

static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev,
				 const u8 *mac, struct station_info *sinfo)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = sdata->local;
	struct sta_info *sta;
	int ret = -ENOENT;

	lockdep_assert_wiphy(local->hw.wiphy);

	sta = sta_info_get_bss(sdata, mac);
	if (sta) {
		ret = 0;
		sta_set_sinfo(sta, sinfo, true);
	}

	return ret;
}

static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
					 struct net_device *dev,
					 struct cfg80211_chan_def *chandef)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);
	struct ieee80211_sub_if_data *sdata;
	struct ieee80211_chan_req chanreq = { .oper = *chandef };
	int ret;

	lockdep_assert_wiphy(local->hw.wiphy);

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) {
		if (cfg80211_chandef_identical(&local->monitor_chanreq.oper,
					       &chanreq.oper))
			return 0;

		sdata = wiphy_dereference(wiphy, local->monitor_sdata);
		if (!sdata)
			goto done;
	}

	if (rcu_access_pointer(sdata->deflink.conf->chanctx_conf) &&
	    cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper,
				       &chanreq.oper))
		return 0;

	ieee80211_link_release_channel(&sdata->deflink);
	ret = ieee80211_link_use_channel(&sdata->deflink, &chanreq,
					 IEEE80211_CHANCTX_SHARED);
	if (ret)
		return ret;
done:
	local->monitor_chanreq = chanreq;
	return 0;
}
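/*
 * AP mode templates: the helpers below install the probe response, FILS
 * discovery, unsolicited broadcast probe response and FTM responder
 * data. The RCU-visible ones are published with rcu_assign_pointer() and
 * the old copy is released with kfree_rcu(), so the TX path never sees a
 * torn update; the FTM responder parameters are not RCU-managed and are
 * replaced with a plain kfree() of the old block.
 */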
static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
				    const u8 *resp, size_t resp_len,
				    const struct ieee80211_csa_settings *csa,
				    const struct ieee80211_color_change_settings *cca,
				    struct ieee80211_link_data *link)
{
	struct probe_resp *new, *old;

	if (!resp || !resp_len)
		return 1;

	old = sdata_dereference(link->u.ap.probe_resp, sdata);

	new = kzalloc(sizeof(struct probe_resp) + resp_len, GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->len = resp_len;
	memcpy(new->data, resp, resp_len);

	if (csa)
		memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp,
		       csa->n_counter_offsets_presp *
		       sizeof(new->cntdwn_counter_offsets[0]));
	else if (cca)
		new->cntdwn_counter_offsets[0] = cca->counter_offset_presp;

	rcu_assign_pointer(link->u.ap.probe_resp, new);
	if (old)
		kfree_rcu(old, rcu_head);

	return 0;
}

static int ieee80211_set_fils_discovery(struct ieee80211_sub_if_data *sdata,
					struct cfg80211_fils_discovery *params,
					struct ieee80211_link_data *link,
					struct ieee80211_bss_conf *link_conf,
					u64 *changed)
{
	struct fils_discovery_data *new, *old = NULL;
	struct ieee80211_fils_discovery *fd;

	if (!params->update)
		return 0;

	fd = &link_conf->fils_discovery;
	fd->min_interval = params->min_interval;
	fd->max_interval = params->max_interval;

	old = sdata_dereference(link->u.ap.fils_discovery, sdata);
	if (old)
		kfree_rcu(old, rcu_head);

	if (params->tmpl && params->tmpl_len) {
		new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL);
		if (!new)
			return -ENOMEM;
		new->len = params->tmpl_len;
		memcpy(new->data, params->tmpl, params->tmpl_len);
		rcu_assign_pointer(link->u.ap.fils_discovery, new);
	} else {
		RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL);
	}

	*changed |= BSS_CHANGED_FILS_DISCOVERY;
	return 0;
}

static int
ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata,
				     struct cfg80211_unsol_bcast_probe_resp *params,
				     struct ieee80211_link_data *link,
				     struct ieee80211_bss_conf *link_conf,
				     u64 *changed)
{
	struct unsol_bcast_probe_resp_data *new, *old = NULL;

	if (!params->update)
		return 0;

	link_conf->unsol_bcast_probe_resp_interval = params->interval;

	old = sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata);
	if (old)
		kfree_rcu(old, rcu_head);

	if (params->tmpl && params->tmpl_len) {
		new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL);
		if (!new)
			return -ENOMEM;
		new->len = params->tmpl_len;
		memcpy(new->data, params->tmpl, params->tmpl_len);
		rcu_assign_pointer(link->u.ap.unsol_bcast_probe_resp, new);
	} else {
		RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL);
	}

	*changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP;
	return 0;
}

static int
ieee80211_set_ftm_responder_params(struct ieee80211_sub_if_data *sdata,
				   const u8 *lci, size_t lci_len,
				   const u8 *civicloc, size_t civicloc_len,
				   struct ieee80211_bss_conf *link_conf)
{
	struct ieee80211_ftm_responder_params *new, *old;
	u8 *pos;
	int len;

	if (!lci_len && !civicloc_len)
		return 0;

	old = link_conf->ftmr_params;
	len = lci_len + civicloc_len;

	new = kzalloc(sizeof(*new) + len, GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	pos = (u8 *)(new + 1);
	if (lci_len) {
		new->lci_len = lci_len;
		new->lci = pos;
		memcpy(pos, lci, lci_len);
		pos += lci_len;
	}

	if (civicloc_len) {
		new->civicloc_len = civicloc_len;
		new->civicloc = pos;
		memcpy(pos, civicloc, civicloc_len);
		pos += civicloc_len;
	}

	link_conf->ftmr_params = new;
	kfree(old);

	return 0;
}

static int ieee80211_copy_mbssid_beacon(u8 *pos,
					struct cfg80211_mbssid_elems *dst,
					struct cfg80211_mbssid_elems *src)
{
	int i, offset = 0;

	dst->cnt = src->cnt;
	for (i = 0; i < src->cnt; i++) {
		memcpy(pos + offset, src->elem[i].data, src->elem[i].len);
		dst->elem[i].len = src->elem[i].len;
		dst->elem[i].data = pos + offset;
		offset += dst->elem[i].len;
	}

	return offset;
}
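/*
 * ieee80211_copy_rnr_beacon() mirrors ieee80211_copy_mbssid_beacon():
 * both flatten the source elements into one buffer at @pos and return
 * the number of bytes consumed, so callers can lay the two regions out
 * back to back inside a single allocation.
 */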
static int ieee80211_copy_rnr_beacon(u8 *pos, struct cfg80211_rnr_elems *dst,
				     struct cfg80211_rnr_elems *src)
{
	int i, offset = 0;

	for (i = 0; i < src->cnt; i++) {
		memcpy(pos + offset, src->elem[i].data, src->elem[i].len);
		dst->elem[i].len = src->elem[i].len;
		dst->elem[i].data = pos + offset;
		offset += dst->elem[i].len;
	}
	dst->cnt = src->cnt;

	return offset;
}
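/*
 * Beacon assignment: everything lives in one allocation so it can be
 * published and freed through a single RCU pointer. As a sketch of the
 * size computation (hypothetical numbers): for a 60-byte head, 40-byte
 * tail and two MBSSID elements, size = sizeof(struct beacon_data) + 60 +
 * 40 + struct_size(mbssid_ies, elem, 2), plus the raw element bytes
 * accounted by ieee80211_get_mbssid_beacon_len().
 */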
static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
				   struct ieee80211_link_data *link,
				   struct cfg80211_beacon_data *params,
				   const struct ieee80211_csa_settings *csa,
				   const struct ieee80211_color_change_settings *cca,
				   u64 *changed)
{
	struct cfg80211_mbssid_elems *mbssid = NULL;
	struct cfg80211_rnr_elems *rnr = NULL;
	struct beacon_data *new, *old;
	int new_head_len, new_tail_len;
	int size, err;
	u64 _changed = BSS_CHANGED_BEACON;
	struct ieee80211_bss_conf *link_conf = link->conf;

	old = sdata_dereference(link->u.ap.beacon, sdata);

	/* Need to have a beacon head if we don't have one yet */
	if (!params->head && !old)
		return -EINVAL;

	/* new or old head? */
	if (params->head)
		new_head_len = params->head_len;
	else
		new_head_len = old->head_len;

	/* new or old tail? */
	if (params->tail || !old)
		/* params->tail_len will be zero for !params->tail */
		new_tail_len = params->tail_len;
	else
		new_tail_len = old->tail_len;

	size = sizeof(*new) + new_head_len + new_tail_len;

	/* new or old multiple BSSID elements? */
	if (params->mbssid_ies) {
		mbssid = params->mbssid_ies;
		size += struct_size(new->mbssid_ies, elem, mbssid->cnt);
		if (params->rnr_ies) {
			rnr = params->rnr_ies;
			size += struct_size(new->rnr_ies, elem, rnr->cnt);
		}
		size += ieee80211_get_mbssid_beacon_len(mbssid, rnr,
							mbssid->cnt);
	} else if (old && old->mbssid_ies) {
		mbssid = old->mbssid_ies;
		size += struct_size(new->mbssid_ies, elem, mbssid->cnt);
		if (old && old->rnr_ies) {
			rnr = old->rnr_ies;
			size += struct_size(new->rnr_ies, elem, rnr->cnt);
		}
		size += ieee80211_get_mbssid_beacon_len(mbssid, rnr,
							mbssid->cnt);
	}

	new = kzalloc(size, GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	/* start filling the new info now */

	/*
	 * pointers go into the block we allocated,
	 * memory is | beacon_data | head | tail | mbssid_ies | rnr_ies
	 */
	new->head = ((u8 *) new) + sizeof(*new);
	new->tail = new->head + new_head_len;
	new->head_len = new_head_len;
	new->tail_len = new_tail_len;

	/* copy in optional mbssid_ies */
	if (mbssid) {
		u8 *pos = new->tail + new->tail_len;

		new->mbssid_ies = (void *)pos;
		pos += struct_size(new->mbssid_ies, elem, mbssid->cnt);
		pos += ieee80211_copy_mbssid_beacon(pos, new->mbssid_ies,
						    mbssid);
		if (rnr) {
			new->rnr_ies = (void *)pos;
			pos += struct_size(new->rnr_ies, elem, rnr->cnt);
			ieee80211_copy_rnr_beacon(pos, new->rnr_ies, rnr);
		}
		/* update bssid_indicator */
		link_conf->bssid_indicator =
			ilog2(__roundup_pow_of_two(mbssid->cnt + 1));
	}

	if (csa) {
		new->cntdwn_current_counter = csa->count;
		memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon,
		       csa->n_counter_offsets_beacon *
		       sizeof(new->cntdwn_counter_offsets[0]));
	} else if (cca) {
		new->cntdwn_current_counter = cca->count;
		new->cntdwn_counter_offsets[0] = cca->counter_offset_beacon;
	}

	/* copy in head */
	if (params->head)
		memcpy(new->head, params->head, new_head_len);
	else
		memcpy(new->head, old->head, new_head_len);

	/* copy in optional tail */
	if (params->tail)
		memcpy(new->tail, params->tail, new_tail_len);
	else if (old)
		memcpy(new->tail, old->tail, new_tail_len);

	err = ieee80211_set_probe_resp(sdata, params->probe_resp,
				       params->probe_resp_len, csa, cca, link);
	if (err < 0) {
		kfree(new);
		return err;
	}
	if (err == 0)
		_changed |= BSS_CHANGED_AP_PROBE_RESP;

	if (params->ftm_responder != -1) {
		link_conf->ftm_responder = params->ftm_responder;
		err = ieee80211_set_ftm_responder_params(sdata, params->lci,
							 params->lci_len,
							 params->civicloc,
							 params->civicloc_len,
							 link_conf);
		if (err < 0) {
			kfree(new);
			return err;
		}

		_changed |= BSS_CHANGED_FTM_RESPONDER;
	}

	rcu_assign_pointer(link->u.ap.beacon, new);
	sdata->u.ap.active = true;

	if (old)
		kfree_rcu(old, rcu_head);

	*changed |= _changed;
	return 0;
}

static u8 ieee80211_num_beaconing_links(struct ieee80211_sub_if_data *sdata)
{
	struct ieee80211_link_data *link;
	u8 link_id, num = 0;

	if (sdata->vif.type != NL80211_IFTYPE_AP &&
	    sdata->vif.type != NL80211_IFTYPE_P2P_GO)
		return num;

	if (!sdata->vif.valid_links)
		return num;

	for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
		link = sdata_dereference(sdata->link[link_id], sdata);
		if (!link)
			continue;

		if (sdata_dereference(link->u.ap.beacon, sdata))
			num++;
	}

	return num;
}
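/*
 * Starting an AP proceeds in stages: capability flags are captured from
 * the parameters, the link is bound to a channel context, the beacon and
 * discovery templates are installed, and only then is the driver started
 * via drv_start_ap(); any failure after channel assignment unwinds
 * through the error label that releases the channel again.
 */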
static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
			      struct cfg80211_ap_settings *params)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = sdata->local;
	struct beacon_data *old;
	struct ieee80211_sub_if_data *vlan;
	u64 changed = BSS_CHANGED_BEACON_INT |
		      BSS_CHANGED_BEACON_ENABLED |
		      BSS_CHANGED_BEACON |
		      BSS_CHANGED_P2P_PS |
		      BSS_CHANGED_TXPOWER |
		      BSS_CHANGED_TWT;
	int i, err;
	int prev_beacon_int;
	unsigned int link_id = params->beacon.link_id;
	struct ieee80211_link_data *link;
	struct ieee80211_bss_conf *link_conf;
	struct ieee80211_chan_req chanreq = { .oper = params->chandef };

	lockdep_assert_wiphy(local->hw.wiphy);

	link = sdata_dereference(sdata->link[link_id], sdata);
	if (!link)
		return -ENOLINK;

	link_conf = link->conf;

	old = sdata_dereference(link->u.ap.beacon, sdata);
	if (old)
		return -EALREADY;

	link->smps_mode = IEEE80211_SMPS_OFF;
	link->needed_rx_chains = sdata->local->rx_chains;

	prev_beacon_int = link_conf->beacon_int;
	link_conf->beacon_int = params->beacon_interval;

	if (params->ht_cap)
		link_conf->ht_ldpc =
			params->ht_cap->cap_info &
				cpu_to_le16(IEEE80211_HT_CAP_LDPC_CODING);

	if (params->vht_cap) {
		link_conf->vht_ldpc =
			params->vht_cap->vht_cap_info &
				cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC);
		link_conf->vht_su_beamformer =
			params->vht_cap->vht_cap_info &
				cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE);
		link_conf->vht_su_beamformee =
			params->vht_cap->vht_cap_info &
				cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE);
		link_conf->vht_mu_beamformer =
			params->vht_cap->vht_cap_info &
				cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE);
		link_conf->vht_mu_beamformee =
			params->vht_cap->vht_cap_info &
				cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE);
	}

	if (params->he_cap && params->he_oper) {
		link_conf->he_support = true;
		link_conf->htc_trig_based_pkt_ext =
			le32_get_bits(params->he_oper->he_oper_params,
				      IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK);
		link_conf->frame_time_rts_th =
			le32_get_bits(params->he_oper->he_oper_params,
				      IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK);
		changed |= BSS_CHANGED_HE_OBSS_PD;

		if (params->beacon.he_bss_color.enabled)
			changed |= BSS_CHANGED_HE_BSS_COLOR;
	}

	if (params->he_cap) {
		link_conf->he_ldpc =
			params->he_cap->phy_cap_info[1] &
				IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD;
		link_conf->he_su_beamformer =
			params->he_cap->phy_cap_info[3] &
				IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER;
		link_conf->he_su_beamformee =
			params->he_cap->phy_cap_info[4] &
				IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE;
		link_conf->he_mu_beamformer =
			params->he_cap->phy_cap_info[4] &
				IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER;
		link_conf->he_full_ul_mumimo =
			params->he_cap->phy_cap_info[2] &
				IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO;
	}

	if (params->eht_cap) {
		if (!link_conf->he_support)
			return -EOPNOTSUPP;

		link_conf->eht_support = true;

		link_conf->eht_su_beamformer =
			params->eht_cap->fixed.phy_cap_info[0] &
				IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER;
		link_conf->eht_su_beamformee =
			params->eht_cap->fixed.phy_cap_info[0] &
				IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE;
		link_conf->eht_mu_beamformer =
			params->eht_cap->fixed.phy_cap_info[7] &
				(IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ |
				 IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ |
				 IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ);
		link_conf->eht_80mhz_full_bw_ul_mumimo =
			params->eht_cap->fixed.phy_cap_info[7] &
				(IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ |
				 IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ |
				 IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ);
	} else {
		link_conf->eht_su_beamformer = false;
		link_conf->eht_su_beamformee = false;
		link_conf->eht_mu_beamformer = false;
	}

	if (sdata->vif.type == NL80211_IFTYPE_AP &&
	    params->mbssid_config.tx_wdev) {
		err = ieee80211_set_ap_mbssid_options(sdata,
						      &params->mbssid_config,
						      link_conf);
		if (err)
			return err;
	}

	err = ieee80211_link_use_channel(link, &chanreq,
					 IEEE80211_CHANCTX_SHARED);
	if (!err)
		ieee80211_link_copy_chanctx_to_vlans(link, false);
	if (err) {
		link_conf->beacon_int = prev_beacon_int;
		return err;
	}

	/*
	 * Apply control port protocol, this allows us to
	 * not encrypt dynamic WEP control frames.
	 */
	sdata->control_port_protocol = params->crypto.control_port_ethertype;
	sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt;
	sdata->control_port_over_nl80211 =
				params->crypto.control_port_over_nl80211;
	sdata->control_port_no_preauth =
				params->crypto.control_port_no_preauth;

	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) {
		vlan->control_port_protocol =
			params->crypto.control_port_ethertype;
		vlan->control_port_no_encrypt =
			params->crypto.control_port_no_encrypt;
		vlan->control_port_over_nl80211 =
			params->crypto.control_port_over_nl80211;
		vlan->control_port_no_preauth =
			params->crypto.control_port_no_preauth;
	}

	link_conf->dtim_period = params->dtim_period;
	link_conf->enable_beacon = true;
	link_conf->allow_p2p_go_ps = sdata->vif.p2p;
	link_conf->twt_responder = params->twt_responder;
	link_conf->he_obss_pd = params->he_obss_pd;
	link_conf->he_bss_color = params->beacon.he_bss_color;
	sdata->vif.cfg.s1g = params->chandef.chan->band ==
				  NL80211_BAND_S1GHZ;

	sdata->vif.cfg.ssid_len = params->ssid_len;
	if (params->ssid_len)
		memcpy(sdata->vif.cfg.ssid, params->ssid,
		       params->ssid_len);
	link_conf->hidden_ssid =
		(params->hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE);

	memset(&link_conf->p2p_noa_attr, 0,
	       sizeof(link_conf->p2p_noa_attr));
	link_conf->p2p_noa_attr.oppps_ctwindow =
		params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
	if (params->p2p_opp_ps)
		link_conf->p2p_noa_attr.oppps_ctwindow |=
					IEEE80211_P2P_OPPPS_ENABLE_BIT;

	sdata->beacon_rate_set = false;
	if (wiphy_ext_feature_isset(local->hw.wiphy,
				    NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) {
		for (i = 0; i < NUM_NL80211_BANDS; i++) {
			sdata->beacon_rateidx_mask[i] =
				params->beacon_rate.control[i].legacy;
			if (sdata->beacon_rateidx_mask[i])
				sdata->beacon_rate_set = true;
		}
	}

	if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL))
		link_conf->beacon_tx_rate = params->beacon_rate;

	err = ieee80211_assign_beacon(sdata, link, &params->beacon, NULL, NULL,
				      &changed);
	if (err < 0)
		goto error;

	err = ieee80211_set_fils_discovery(sdata, &params->fils_discovery, link,
					   link_conf, &changed);
	if (err < 0)
		goto error;

	err = ieee80211_set_unsol_bcast_probe_resp(sdata,
						   &params->unsol_bcast_probe_resp,
						   link, link_conf, &changed);
	if (err < 0)
		goto error;

	err = drv_start_ap(sdata->local, sdata, link_conf);
	if (err) {
		old = sdata_dereference(link->u.ap.beacon, sdata);

		if (old)
			kfree_rcu(old, rcu_head);
		RCU_INIT_POINTER(link->u.ap.beacon, NULL);

		if (ieee80211_num_beaconing_links(sdata) == 0)
			sdata->u.ap.active = false;

		goto error;
	}

	ieee80211_recalc_dtim(local, sdata);
	ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID);
	ieee80211_link_info_change_notify(sdata, link, changed);

	if (ieee80211_num_beaconing_links(sdata) <= 1)
		netif_carrier_on(dev);

	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
		netif_carrier_on(vlan->dev);

	return 0;

error:
	ieee80211_link_release_channel(link);

	return err;
}
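/*
 * Changing the beacon reuses ieee80211_assign_beacon(), but is refused
 * while a channel-switch or color-change countdown is running: the
 * countdown counter offsets are positions inside the current beacon
 * template, so swapping the template underneath would invalidate them.
 */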
static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
				   struct cfg80211_ap_update *params)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_link_data *link;
	struct cfg80211_beacon_data *beacon = &params->beacon;
	struct beacon_data *old;
	int err;
	struct ieee80211_bss_conf *link_conf;
	u64 changed = 0;

	lockdep_assert_wiphy(wiphy);

	link = sdata_dereference(sdata->link[beacon->link_id], sdata);
	if (!link)
		return -ENOLINK;

	link_conf = link->conf;

	/* don't allow changing the beacon while a countdown is in place - offset
	 * of channel switch counter may change
	 */
	if (link_conf->csa_active || link_conf->color_change_active)
		return -EBUSY;

	old = sdata_dereference(link->u.ap.beacon, sdata);
	if (!old)
		return -ENOENT;

	err = ieee80211_assign_beacon(sdata, link, beacon, NULL, NULL,
				      &changed);
	if (err < 0)
		return err;

	err = ieee80211_set_fils_discovery(sdata, &params->fils_discovery,
					   link, link_conf, &changed);
	if (err < 0)
		return err;

	err = ieee80211_set_unsol_bcast_probe_resp(sdata,
						   &params->unsol_bcast_probe_resp,
						   link, link_conf, &changed);
	if (err < 0)
		return err;

	if (beacon->he_bss_color_valid &&
	    beacon->he_bss_color.enabled != link_conf->he_bss_color.enabled) {
		link_conf->he_bss_color.enabled = beacon->he_bss_color.enabled;
		changed |= BSS_CHANGED_HE_BSS_COLOR;
	}

	ieee80211_link_info_change_notify(sdata, link, changed);

	return 0;
}

static void ieee80211_free_next_beacon(struct ieee80211_link_data *link)
{
	if (!link->u.ap.next_beacon)
		return;

	kfree(link->u.ap.next_beacon->mbssid_ies);
	kfree(link->u.ap.next_beacon->rnr_ies);
	kfree(link->u.ap.next_beacon);
	link->u.ap.next_beacon = NULL;
}
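/*
 * Stopping an AP (or a single link of an MLD AP) tears things down in
 * roughly the reverse order of ieee80211_start_ap(): templates and
 * stations go first, then the driver is stopped, buffered broadcast
 * frames are purged, and the channel context is released last.
 */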
static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
			     unsigned int link_id)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_sub_if_data *vlan;
	struct ieee80211_local *local = sdata->local;
	struct beacon_data *old_beacon;
	struct probe_resp *old_probe_resp;
	struct fils_discovery_data *old_fils_discovery;
	struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp;
	struct cfg80211_chan_def chandef;
	struct ieee80211_link_data *link =
		sdata_dereference(sdata->link[link_id], sdata);
	struct ieee80211_bss_conf *link_conf = link->conf;
	LIST_HEAD(keys);

	lockdep_assert_wiphy(local->hw.wiphy);

	old_beacon = sdata_dereference(link->u.ap.beacon, sdata);
	if (!old_beacon)
		return -ENOENT;
	old_probe_resp = sdata_dereference(link->u.ap.probe_resp, sdata);
	old_fils_discovery = sdata_dereference(link->u.ap.fils_discovery,
					       sdata);
	old_unsol_bcast_probe_resp =
		sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata);

	/* abort any running channel switch or color change */
	link_conf->csa_active = false;
	link_conf->color_change_active = false;
	ieee80211_vif_unblock_queues_csa(sdata);

	ieee80211_free_next_beacon(link);

	/* turn off carrier for this interface and dependent VLANs */
	list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
		netif_carrier_off(vlan->dev);

	if (ieee80211_num_beaconing_links(sdata) <= 1) {
		netif_carrier_off(dev);
		sdata->u.ap.active = false;
	}

	/* remove beacon and probe response */
	RCU_INIT_POINTER(link->u.ap.beacon, NULL);
	RCU_INIT_POINTER(link->u.ap.probe_resp, NULL);
	RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL);
	RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL);
	kfree_rcu(old_beacon, rcu_head);
	if (old_probe_resp)
		kfree_rcu(old_probe_resp, rcu_head);
	if (old_fils_discovery)
		kfree_rcu(old_fils_discovery, rcu_head);
	if (old_unsol_bcast_probe_resp)
		kfree_rcu(old_unsol_bcast_probe_resp, rcu_head);

	kfree(link_conf->ftmr_params);
	link_conf->ftmr_params = NULL;

	sdata->vif.mbssid_tx_vif = NULL;
	link_conf->bssid_index = 0;
	link_conf->nontransmitted = false;
	link_conf->ema_ap = false;
	link_conf->bssid_indicator = 0;

	__sta_info_flush(sdata, true, link_id, NULL);

	ieee80211_remove_link_keys(link, &keys);
	if (!list_empty(&keys)) {
		synchronize_net();
		ieee80211_free_key_list(local, &keys);
	}

	link_conf->enable_beacon = false;
	sdata->beacon_rate_set = false;
	sdata->vif.cfg.ssid_len = 0;
	clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
	ieee80211_link_info_change_notify(sdata, link,
					  BSS_CHANGED_BEACON_ENABLED);

	if (sdata->wdev.links[link_id].cac_started) {
		chandef = link_conf->chanreq.oper;
		wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work);
		cfg80211_cac_event(sdata->dev, &chandef,
				   NL80211_RADAR_CAC_ABORTED,
				   GFP_KERNEL, link_id);
	}

	drv_stop_ap(sdata->local, sdata, link_conf);

	/* free all potentially still buffered bcast frames */
	local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf);
	ieee80211_purge_tx_queue(&local->hw, &sdata->u.ap.ps.bc_buf);

	ieee80211_link_copy_chanctx_to_vlans(link, true);
	ieee80211_link_release_channel(link);

	return 0;
}
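/*
 * Station flag updates drive mac80211's station state machine
 * (NONE -> AUTH -> ASSOC -> AUTHORIZED). The checks below are ordered so
 * the state only moves one step at a time in the right direction: for
 * instance, clearing AUTHORIZED drops back to ASSOC before a cleared
 * ASSOCIATED flag drops the station further to AUTH.
 */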
static int sta_apply_auth_flags(struct ieee80211_local *local,
				struct sta_info *sta,
				u32 mask, u32 set)
{
	int ret;

	if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) &&
	    set & BIT(NL80211_STA_FLAG_AUTHENTICATED) &&
	    !test_sta_flag(sta, WLAN_STA_AUTH)) {
		ret = sta_info_move_state(sta, IEEE80211_STA_AUTH);
		if (ret)
			return ret;
	}

	if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) &&
	    set & BIT(NL80211_STA_FLAG_ASSOCIATED) &&
	    !test_sta_flag(sta, WLAN_STA_ASSOC)) {
		/*
		 * When peer becomes associated, init rate control as
		 * well. Some drivers require rate control initialized
		 * before drv_sta_state() is called.
		 */
		if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
			rate_control_rate_init_all_links(sta);

		ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
		if (ret)
			return ret;
	}

	if (mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
		if (set & BIT(NL80211_STA_FLAG_AUTHORIZED))
			ret = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED);
		else if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
			ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
		else
			ret = 0;
		if (ret)
			return ret;
	}

	if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) &&
	    !(set & BIT(NL80211_STA_FLAG_ASSOCIATED)) &&
	    test_sta_flag(sta, WLAN_STA_ASSOC)) {
		ret = sta_info_move_state(sta, IEEE80211_STA_AUTH);
		if (ret)
			return ret;
	}

	if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) &&
	    !(set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) &&
	    test_sta_flag(sta, WLAN_STA_AUTH)) {
		ret = sta_info_move_state(sta, IEEE80211_STA_NONE);
		if (ret)
			return ret;
	}

	return 0;
}

static void sta_apply_mesh_params(struct ieee80211_local *local,
				  struct sta_info *sta,
				  struct station_parameters *params)
{
#ifdef CONFIG_MAC80211_MESH
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	u64 changed = 0;

	if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) {
		switch (params->plink_state) {
		case NL80211_PLINK_ESTAB:
			if (sta->mesh->plink_state != NL80211_PLINK_ESTAB)
				changed = mesh_plink_inc_estab_count(sdata);
			sta->mesh->plink_state = params->plink_state;
			sta->mesh->aid = params->peer_aid;

			ieee80211_mps_sta_status_update(sta);
			changed |= ieee80211_mps_set_sta_local_pm(sta,
				      sdata->u.mesh.mshcfg.power_mode);

			ewma_mesh_tx_rate_avg_init(&sta->mesh->tx_rate_avg);
			/* init at low value */
			ewma_mesh_tx_rate_avg_add(&sta->mesh->tx_rate_avg, 10);

			break;
		case NL80211_PLINK_LISTEN:
		case NL80211_PLINK_BLOCKED:
		case NL80211_PLINK_OPN_SNT:
		case NL80211_PLINK_OPN_RCVD:
		case NL80211_PLINK_CNF_RCVD:
		case NL80211_PLINK_HOLDING:
			if (sta->mesh->plink_state == NL80211_PLINK_ESTAB)
				changed = mesh_plink_dec_estab_count(sdata);
			sta->mesh->plink_state = params->plink_state;

			ieee80211_mps_sta_status_update(sta);
			changed |= ieee80211_mps_set_sta_local_pm(sta,
					NL80211_MESH_POWER_UNKNOWN);
			break;
		default:
			/* nothing */
			break;
		}
	}

	switch (params->plink_action) {
	case NL80211_PLINK_ACTION_NO_ACTION:
		/* nothing */
		break;
	case NL80211_PLINK_ACTION_OPEN:
		changed |= mesh_plink_open(sta);
		break;
	case NL80211_PLINK_ACTION_BLOCK:
		changed |= mesh_plink_block(sta);
		break;
	}

	if (params->local_pm)
		changed |= ieee80211_mps_set_sta_local_pm(sta,
							  params->local_pm);

	ieee80211_mbss_info_change_notify(sdata, changed);
#endif
}

enum sta_link_apply_mode {
	STA_LINK_MODE_NEW,
	STA_LINK_MODE_STA_MODIFY,
	STA_LINK_MODE_LINK_MODIFY,
};
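/*
 * Link parameters are applied in one of three modes: STA_LINK_MODE_NEW
 * for a link that is just being added (which must come with a link MAC
 * address), STA_LINK_MODE_LINK_MODIFY for explicit per-link updates,
 * and STA_LINK_MODE_STA_MODIFY when a station-wide change may implicitly
 * touch the default link; in that last case, having nothing to change is
 * not an error.
 */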
static int sta_link_apply_parameters(struct ieee80211_local *local,
				     struct sta_info *sta,
				     enum sta_link_apply_mode mode,
				     struct link_station_parameters *params)
{
	struct ieee80211_supported_band *sband;
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	u32 link_id = params->link_id < 0 ? 0 : params->link_id;
	struct ieee80211_link_data *link =
		sdata_dereference(sdata->link[link_id], sdata);
	struct link_sta_info *link_sta =
		rcu_dereference_protected(sta->link[link_id],
					  lockdep_is_held(&local->hw.wiphy->mtx));
	bool changes = params->link_mac ||
		       params->txpwr_set ||
		       params->supported_rates_len ||
		       params->ht_capa ||
		       params->vht_capa ||
		       params->he_capa ||
		       params->eht_capa ||
		       params->opmode_notif_used;

	switch (mode) {
	case STA_LINK_MODE_NEW:
		if (!params->link_mac)
			return -EINVAL;
		break;
	case STA_LINK_MODE_LINK_MODIFY:
		break;
	case STA_LINK_MODE_STA_MODIFY:
		if (params->link_id >= 0)
			break;
		if (!changes)
			return 0;
		break;
	}

	if (!link || !link_sta)
		return -EINVAL;

	sband = ieee80211_get_link_sband(link);
	if (!sband)
		return -EINVAL;

	if (params->link_mac) {
		if (mode == STA_LINK_MODE_NEW) {
			memcpy(link_sta->addr, params->link_mac, ETH_ALEN);
			memcpy(link_sta->pub->addr, params->link_mac, ETH_ALEN);
		} else if (!ether_addr_equal(link_sta->addr,
					     params->link_mac)) {
			return -EINVAL;
		}
	}

	if (params->txpwr_set) {
		int ret;

		link_sta->pub->txpwr.type = params->txpwr.type;
		if (params->txpwr.type == NL80211_TX_POWER_LIMITED)
			link_sta->pub->txpwr.power = params->txpwr.power;
		ret = drv_sta_set_txpwr(local, sdata, sta);
		if (ret)
			return ret;
	}

	if (params->supported_rates &&
	    params->supported_rates_len) {
		ieee80211_parse_bitrates(link->conf->chanreq.oper.width,
					 sband, params->supported_rates,
					 params->supported_rates_len,
					 &link_sta->pub->supp_rates[sband->band]);
	}

	if (params->ht_capa)
		ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
						  params->ht_capa, link_sta);

	/* VHT can override some HT caps such as the A-MSDU max length */
	if (params->vht_capa)
		ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
						    params->vht_capa, NULL,
						    link_sta);

	if (params->he_capa)
		ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
						  (void *)params->he_capa,
						  params->he_capa_len,
						  (void *)params->he_6ghz_capa,
						  link_sta);

	if (params->he_capa && params->eht_capa)
		ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband,
						    (u8 *)params->he_capa,
						    params->he_capa_len,
						    params->eht_capa,
						    params->eht_capa_len,
						    link_sta);

	ieee80211_sta_init_nss(link_sta);

	if (params->opmode_notif_used) {
		/* returned value is only needed for rc update, but the
		 * rc isn't initialized here yet, so ignore it
		 */
		__ieee80211_vht_handle_opmode(sdata, link_sta,
					      params->opmode_notif,
					      sband->band);
	}

	return 0;
}
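/*
 * Apply station parameters coming from cfg80211 to a station entry:
 * flag handling (with mode-specific fix-ups for mesh and TDLS), WME,
 * U-APSD, AID, listen interval and the per-link parameters above. The
 * authentication/association state transitions are deliberately done
 * either before or after the other updates, depending on whether the
 * station is moving into the associated state.
 */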
static int sta_apply_parameters(struct ieee80211_local *local,
				struct sta_info *sta,
				struct station_parameters *params)
{
	struct ieee80211_sub_if_data *sdata = sta->sdata;
	u32 mask, set;
	int ret = 0;

	mask = params->sta_flags_mask;
	set = params->sta_flags_set;

	if (ieee80211_vif_is_mesh(&sdata->vif)) {
		/*
		 * In mesh mode, ASSOCIATED isn't part of the nl80211
		 * API but must follow AUTHENTICATED for driver state.
		 */
		if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED))
			mask |= BIT(NL80211_STA_FLAG_ASSOCIATED);
		if (set & BIT(NL80211_STA_FLAG_AUTHENTICATED))
			set |= BIT(NL80211_STA_FLAG_ASSOCIATED);
	} else if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
		/*
		 * TDLS -- everything follows authorized, but
		 * only becoming authorized is possible, not
		 * going back
		 */
		if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
			set |= BIT(NL80211_STA_FLAG_AUTHENTICATED) |
			       BIT(NL80211_STA_FLAG_ASSOCIATED);
			mask |= BIT(NL80211_STA_FLAG_AUTHENTICATED) |
				BIT(NL80211_STA_FLAG_ASSOCIATED);
		}
	}

	if (mask & BIT(NL80211_STA_FLAG_WME) &&
	    local->hw.queues >= IEEE80211_NUM_ACS)
		sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME);

	/* auth flags will be set later for TDLS,
	 * and for unassociated stations that move to associated */
	if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
	    !((mask & BIT(NL80211_STA_FLAG_ASSOCIATED)) &&
	      (set & BIT(NL80211_STA_FLAG_ASSOCIATED)))) {
		ret = sta_apply_auth_flags(local, sta, mask, set);
		if (ret)
			return ret;
	}

	if (mask & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) {
		if (set & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE))
			set_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE);
		else
			clear_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE);
	}

	if (mask & BIT(NL80211_STA_FLAG_MFP)) {
		sta->sta.mfp = !!(set & BIT(NL80211_STA_FLAG_MFP));
		if (set & BIT(NL80211_STA_FLAG_MFP))
			set_sta_flag(sta, WLAN_STA_MFP);
		else
			clear_sta_flag(sta, WLAN_STA_MFP);
	}

	if (mask & BIT(NL80211_STA_FLAG_TDLS_PEER)) {
		if (set & BIT(NL80211_STA_FLAG_TDLS_PEER))
			set_sta_flag(sta, WLAN_STA_TDLS_PEER);
		else
			clear_sta_flag(sta, WLAN_STA_TDLS_PEER);
	}

	if (mask & BIT(NL80211_STA_FLAG_SPP_AMSDU))
		sta->sta.spp_amsdu = set & BIT(NL80211_STA_FLAG_SPP_AMSDU);

	/* mark TDLS channel switch support, if the AP allows it */
	if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
	    !sdata->deflink.u.mgd.tdls_chan_switch_prohibited &&
	    params->ext_capab_len >= 4 &&
	    params->ext_capab[3] & WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH)
		set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH);

	if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
	    !sdata->u.mgd.tdls_wider_bw_prohibited &&
	    ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) &&
	    params->ext_capab_len >= 8 &&
	    params->ext_capab[7] & WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED)
		set_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW);

	if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) {
		sta->sta.uapsd_queues = params->uapsd_queues;
		sta->sta.max_sp = params->max_sp;
	}

	ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab,
					      params->ext_capab_len);

	/*
	 * cfg80211 validates this (1-2007) and allows setting the AID
	 * only when creating a new station entry
	 */
	if (params->aid)
		sta->sta.aid = params->aid;

	/*
	 * Some of the following updates would be racy if called on an
	 * existing station, via ieee80211_change_station(). However,
	 * all such changes are rejected by cfg80211 except for updates
	 * changing the supported rates on an existing but not yet used
	 * TDLS peer.
	 */

	if (params->listen_interval >= 0)
		sta->listen_interval = params->listen_interval;

	ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_STA_MODIFY,
					&params->link_sta_params);
	if (ret)
		return ret;

	if (params->support_p2p_ps >= 0)
		sta->sta.support_p2p_ps = params->support_p2p_ps;

	if (ieee80211_vif_is_mesh(&sdata->vif))
		sta_apply_mesh_params(local, sta, params);

	if (params->airtime_weight)
		sta->airtime_weight = params->airtime_weight;

	/* set the STA state after all sta info from usermode has been set */
	if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) ||
	    set & BIT(NL80211_STA_FLAG_ASSOCIATED)) {
		ret = sta_apply_auth_flags(local, sta, mask, set);
		if (ret)
			return ret;
	}

	/* Mark the STA as MLO if MLD MAC address is available */
	if (params->link_sta_params.mld_mac)
		sta->sta.mlo = true;

	return 0;
}
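/*
 * cfg80211 add_station handler: validates the MAC address and optional
 * VLAN, allocates the station entry (with a specific link for MLO),
 * applies the userspace-supplied parameters and finally inserts the
 * station into the hash table.
 */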
static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
				 const u8 *mac,
				 struct station_parameters *params)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);
	struct sta_info *sta;
	struct ieee80211_sub_if_data *sdata;
	int err;

	lockdep_assert_wiphy(local->hw.wiphy);

	if (params->vlan) {
		sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);

		if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
		    sdata->vif.type != NL80211_IFTYPE_AP)
			return -EINVAL;
	} else
		sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	if (ether_addr_equal(mac, sdata->vif.addr))
		return -EINVAL;

	if (!is_valid_ether_addr(mac))
		return -EINVAL;

	if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER) &&
	    sdata->vif.type == NL80211_IFTYPE_STATION &&
	    !sdata->u.mgd.associated)
		return -EINVAL;

	/*
	 * If we have a link ID, it can be a non-MLO station on an AP MLD,
	 * but we need to have a link_mac in that case as well, so use the
	 * STA's MAC address in that case.
	 */
	if (params->link_sta_params.link_id >= 0)
		sta = sta_info_alloc_with_link(sdata, mac,
					       params->link_sta_params.link_id,
					       params->link_sta_params.link_mac ?: mac,
					       GFP_KERNEL);
	else
		sta = sta_info_alloc(sdata, mac, GFP_KERNEL);
	if (!sta)
		return -ENOMEM;

	if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))
		sta->sta.tdls = true;

	/* Though the mutex is not needed here (since the station is not
	 * visible yet), sta_apply_parameters (and inner functions) require
	 * the mutex due to other paths.
	 */
	err = sta_apply_parameters(local, sta, params);
	if (err) {
		sta_info_free(local, sta);
		return err;
	}

	/*
	 * for TDLS and for unassociated station, rate control should be
	 * initialized only when rates are known and station is marked
	 * authorized/associated
	 */
	if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
	    test_sta_flag(sta, WLAN_STA_ASSOC))
		rate_control_rate_init_all_links(sta);

	return sta_info_insert(sta);
}

static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev,
				 struct station_del_parameters *params)
{
	struct ieee80211_sub_if_data *sdata;

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	if (params->mac)
		return sta_info_destroy_addr_bss(sdata, params->mac);

	sta_info_flush(sdata, params->link_id);
	return 0;
}
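/*
 * cfg80211 change_station handler: classifies the station (mesh peer,
 * IBSS, TDLS, AP client, ...) so cfg80211 can validate the requested
 * change, optionally moves the station to a different AP_VLAN, and then
 * re-applies the station parameters.
 */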
static int ieee80211_change_station(struct wiphy *wiphy,
				    struct net_device *dev, const u8 *mac,
				    struct station_parameters *params)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = wiphy_priv(wiphy);
	struct sta_info *sta;
	struct ieee80211_sub_if_data *vlansdata;
	enum cfg80211_station_type statype;
	int err;

	lockdep_assert_wiphy(local->hw.wiphy);

	sta = sta_info_get_bss(sdata, mac);
	if (!sta)
		return -ENOENT;

	switch (sdata->vif.type) {
	case NL80211_IFTYPE_MESH_POINT:
		if (sdata->u.mesh.user_mpm)
			statype = CFG80211_STA_MESH_PEER_USER;
		else
			statype = CFG80211_STA_MESH_PEER_KERNEL;
		break;
	case NL80211_IFTYPE_ADHOC:
		statype = CFG80211_STA_IBSS;
		break;
	case NL80211_IFTYPE_STATION:
		if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
			statype = CFG80211_STA_AP_STA;
			break;
		}
		if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
			statype = CFG80211_STA_TDLS_PEER_ACTIVE;
		else
			statype = CFG80211_STA_TDLS_PEER_SETUP;
		break;
	case NL80211_IFTYPE_AP:
	case NL80211_IFTYPE_AP_VLAN:
		if (test_sta_flag(sta, WLAN_STA_ASSOC))
			statype = CFG80211_STA_AP_CLIENT;
		else
			statype = CFG80211_STA_AP_CLIENT_UNASSOC;
		break;
	default:
		return -EOPNOTSUPP;
	}

	err = cfg80211_check_station_change(wiphy, params, statype);
	if (err)
		return err;

	if (params->vlan && params->vlan != sta->sdata->dev) {
		vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);

		if (params->vlan->ieee80211_ptr->use_4addr) {
			if (vlansdata->u.vlan.sta)
				return -EBUSY;

			rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
			__ieee80211_check_fast_rx_iface(vlansdata);
			drv_sta_set_4addr(local, sta->sdata, &sta->sta, true);
		}

		if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
		    sta->sdata->u.vlan.sta)
			RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL);

		if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
			ieee80211_vif_dec_num_mcast(sta->sdata);

		sta->sdata = vlansdata;
		ieee80211_check_fast_rx(sta);
		ieee80211_check_fast_xmit(sta);

		if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) {
			ieee80211_vif_inc_num_mcast(sta->sdata);
			cfg80211_send_layer2_update(sta->sdata->dev,
						    sta->sta.addr);
		}
	}

	err = sta_apply_parameters(local, sta, params);
	if (err)
		return err;

	if (sdata->vif.type == NL80211_IFTYPE_STATION &&
	    params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
		ieee80211_recalc_ps(local);
		ieee80211_recalc_ps_vif(sdata);
	}

	return 0;
}
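/*
 * Mesh path table operations: the following handlers add, delete,
 * change and dump entries of the mesh path and mesh proxy path tables.
 */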
#ifdef CONFIG_MAC80211_MESH
static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev,
			       const u8 *dst, const u8 *next_hop)
{
	struct ieee80211_sub_if_data *sdata;
	struct mesh_path *mpath;
	struct sta_info *sta;

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	rcu_read_lock();
	sta = sta_info_get(sdata, next_hop);
	if (!sta) {
		rcu_read_unlock();
		return -ENOENT;
	}

	mpath = mesh_path_add(sdata, dst);
	if (IS_ERR(mpath)) {
		rcu_read_unlock();
		return PTR_ERR(mpath);
	}

	mesh_path_fix_nexthop(mpath, sta);
	rcu_read_unlock();
	return 0;
}

static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev,
			       const u8 *dst)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	if (dst)
		return mesh_path_del(sdata, dst);

	mesh_path_flush_by_iface(sdata);
	return 0;
}

static int ieee80211_change_mpath(struct wiphy *wiphy, struct net_device *dev,
				  const u8 *dst, const u8 *next_hop)
{
	struct ieee80211_sub_if_data *sdata;
	struct mesh_path *mpath;
	struct sta_info *sta;

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	rcu_read_lock();
	sta = sta_info_get(sdata, next_hop);
	if (!sta) {
		rcu_read_unlock();
		return -ENOENT;
	}

	mpath = mesh_path_lookup(sdata, dst);
	if (!mpath) {
		rcu_read_unlock();
		return -ENOENT;
	}

	mesh_path_fix_nexthop(mpath, sta);
	rcu_read_unlock();
	return 0;
}

static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
			    struct mpath_info *pinfo)
{
	struct sta_info *next_hop_sta = rcu_dereference(mpath->next_hop);

	if (next_hop_sta)
		memcpy(next_hop, next_hop_sta->sta.addr, ETH_ALEN);
	else
		eth_zero_addr(next_hop);

	memset(pinfo, 0, sizeof(*pinfo));

	pinfo->generation = mpath->sdata->u.mesh.mesh_paths_generation;

	pinfo->filled = MPATH_INFO_FRAME_QLEN |
			MPATH_INFO_SN |
			MPATH_INFO_METRIC |
			MPATH_INFO_EXPTIME |
			MPATH_INFO_DISCOVERY_TIMEOUT |
			MPATH_INFO_DISCOVERY_RETRIES |
			MPATH_INFO_FLAGS |
			MPATH_INFO_HOP_COUNT |
			MPATH_INFO_PATH_CHANGE;

	pinfo->frame_qlen = mpath->frame_queue.qlen;
	pinfo->sn = mpath->sn;
	pinfo->metric = mpath->metric;
	if (time_before(jiffies, mpath->exp_time))
		pinfo->exptime = jiffies_to_msecs(mpath->exp_time - jiffies);
	pinfo->discovery_timeout =
			jiffies_to_msecs(mpath->discovery_timeout);
	pinfo->discovery_retries = mpath->discovery_retries;
	if (mpath->flags & MESH_PATH_ACTIVE)
		pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE;
	if (mpath->flags & MESH_PATH_RESOLVING)
		pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING;
	if (mpath->flags & MESH_PATH_SN_VALID)
		pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID;
	if (mpath->flags & MESH_PATH_FIXED)
		pinfo->flags |= NL80211_MPATH_FLAG_FIXED;
	if (mpath->flags & MESH_PATH_RESOLVED)
		pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED;
	pinfo->hop_count = mpath->hop_count;
	pinfo->path_change_count = mpath->path_change_count;
}

static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev,
			       u8 *dst, u8 *next_hop, struct mpath_info *pinfo)
{
	struct ieee80211_sub_if_data *sdata;
	struct mesh_path *mpath;

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	rcu_read_lock();
	mpath = mesh_path_lookup(sdata, dst);
	if (!mpath) {
		rcu_read_unlock();
		return -ENOENT;
	}
	memcpy(dst, mpath->dst, ETH_ALEN);
	mpath_set_pinfo(mpath, next_hop, pinfo);
	rcu_read_unlock();
	return 0;
}

static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev,
				int idx, u8 *dst, u8 *next_hop,
				struct mpath_info *pinfo)
{
	struct ieee80211_sub_if_data *sdata;
	struct mesh_path *mpath;

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	rcu_read_lock();
	mpath = mesh_path_lookup_by_idx(sdata, idx);
	if (!mpath) {
		rcu_read_unlock();
		return -ENOENT;
	}
	memcpy(dst, mpath->dst, ETH_ALEN);
	mpath_set_pinfo(mpath, next_hop, pinfo);
	rcu_read_unlock();
	return 0;
}

static void mpp_set_pinfo(struct mesh_path *mpath, u8 *mpp,
			  struct mpath_info *pinfo)
{
	memset(pinfo, 0, sizeof(*pinfo));
	memcpy(mpp, mpath->mpp, ETH_ALEN);

	pinfo->generation = mpath->sdata->u.mesh.mpp_paths_generation;
}

static int ieee80211_get_mpp(struct wiphy *wiphy, struct net_device *dev,
			     u8 *dst, u8 *mpp, struct mpath_info *pinfo)
{
	struct ieee80211_sub_if_data *sdata;
	struct mesh_path *mpath;

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	rcu_read_lock();
	mpath = mpp_path_lookup(sdata, dst);
	if (!mpath) {
		rcu_read_unlock();
		return -ENOENT;
	}
	memcpy(dst, mpath->dst, ETH_ALEN);
	mpp_set_pinfo(mpath, mpp, pinfo);
	rcu_read_unlock();
	return 0;
}

static int ieee80211_dump_mpp(struct wiphy *wiphy, struct net_device *dev,
			      int idx, u8 *dst, u8 *mpp,
			      struct mpath_info *pinfo)
{
	struct ieee80211_sub_if_data *sdata;
	struct mesh_path *mpath;

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	rcu_read_lock();
	mpath = mpp_path_lookup_by_idx(sdata, idx);
	if (!mpath) {
		rcu_read_unlock();
		return -ENOENT;
	}
	memcpy(dst, mpath->dst, ETH_ALEN);
	mpp_set_pinfo(mpath, mpp, pinfo);
	rcu_read_unlock();
	return 0;
}

static int ieee80211_get_mesh_config(struct wiphy *wiphy,
				     struct net_device *dev,
				     struct mesh_config *conf)
{
	struct ieee80211_sub_if_data *sdata;

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	memcpy(conf, &(sdata->u.mesh.mshcfg), sizeof(struct mesh_config));
	return 0;
}

static inline bool _chg_mesh_attr(enum nl80211_meshconf_params parm, u32 mask)
{
	return (mask >> (parm - 1)) & 0x1;
}

static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh,
			   const struct mesh_setup *setup)
{
	u8 *new_ie;
	struct ieee80211_sub_if_data *sdata = container_of(ifmsh,
					struct ieee80211_sub_if_data, u.mesh);
	int i;

	/* allocate information elements */
	new_ie = NULL;

	if (setup->ie_len) {
		new_ie = kmemdup(setup->ie, setup->ie_len, GFP_KERNEL);
		if (!new_ie)
			return -ENOMEM;
	}
	ifmsh->ie_len = setup->ie_len;
	ifmsh->ie = new_ie;

	/* now copy the rest of the setup parameters */
	ifmsh->mesh_id_len = setup->mesh_id_len;
	memcpy(ifmsh->mesh_id, setup->mesh_id, ifmsh->mesh_id_len);
	ifmsh->mesh_sp_id = setup->sync_method;
	ifmsh->mesh_pp_id = setup->path_sel_proto;
	ifmsh->mesh_pm_id = setup->path_metric;
	ifmsh->user_mpm = setup->user_mpm;
	ifmsh->mesh_auth_id = setup->auth_id;
	ifmsh->security = IEEE80211_MESH_SEC_NONE;
	ifmsh->userspace_handles_dfs = setup->userspace_handles_dfs;
	if (setup->is_authenticated)
		ifmsh->security |= IEEE80211_MESH_SEC_AUTHED;
	if (setup->is_secure)
		ifmsh->security |= IEEE80211_MESH_SEC_SECURED;

	/* mcast rate setting in Mesh Node */
	memcpy(sdata->vif.bss_conf.mcast_rate, setup->mcast_rate,
	       sizeof(setup->mcast_rate));
	sdata->vif.bss_conf.basic_rates = setup->basic_rates;

	sdata->vif.bss_conf.beacon_int = setup->beacon_interval;
	sdata->vif.bss_conf.dtim_period = setup->dtim_period;

	sdata->beacon_rate_set = false;
	if (wiphy_ext_feature_isset(sdata->local->hw.wiphy,
				    NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) {
		for (i = 0; i < NUM_NL80211_BANDS; i++) {
			sdata->beacon_rateidx_mask[i] =
				setup->beacon_rate.control[i].legacy;
			if (sdata->beacon_rateidx_mask[i])
				sdata->beacon_rate_set = true;
		}
	}

	return 0;
}

static int ieee80211_update_mesh_config(struct wiphy *wiphy,
					struct net_device *dev, u32 mask,
					const struct mesh_config *nconf)
{
	struct mesh_config *conf;
	struct ieee80211_sub_if_data *sdata;
	struct ieee80211_if_mesh *ifmsh;

	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	ifmsh = &sdata->u.mesh;

	/* Set the config options which we are interested in setting */
	conf = &(sdata->u.mesh.mshcfg);
	if (_chg_mesh_attr(NL80211_MESHCONF_RETRY_TIMEOUT, mask))
		conf->dot11MeshRetryTimeout = nconf->dot11MeshRetryTimeout;
	if (_chg_mesh_attr(NL80211_MESHCONF_CONFIRM_TIMEOUT, mask))
		conf->dot11MeshConfirmTimeout = nconf->dot11MeshConfirmTimeout;
	if (_chg_mesh_attr(NL80211_MESHCONF_HOLDING_TIMEOUT, mask))
		conf->dot11MeshHoldingTimeout = nconf->dot11MeshHoldingTimeout;
	if (_chg_mesh_attr(NL80211_MESHCONF_MAX_PEER_LINKS, mask))
		conf->dot11MeshMaxPeerLinks = nconf->dot11MeshMaxPeerLinks;
	if (_chg_mesh_attr(NL80211_MESHCONF_MAX_RETRIES, mask))
		conf->dot11MeshMaxRetries = nconf->dot11MeshMaxRetries;
	if (_chg_mesh_attr(NL80211_MESHCONF_TTL, mask))
		conf->dot11MeshTTL = nconf->dot11MeshTTL;
	if (_chg_mesh_attr(NL80211_MESHCONF_ELEMENT_TTL, mask))
		conf->element_ttl = nconf->element_ttl;
	if (_chg_mesh_attr(NL80211_MESHCONF_AUTO_OPEN_PLINKS, mask)) {
		if (ifmsh->user_mpm)
			return -EBUSY;
		conf->auto_open_plinks = nconf->auto_open_plinks;
	}
	if (_chg_mesh_attr(NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, mask))
		conf->dot11MeshNbrOffsetMaxNeighbor =
			nconf->dot11MeshNbrOffsetMaxNeighbor;
	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, mask))
		conf->dot11MeshHWMPmaxPREQretries =
			nconf->dot11MeshHWMPmaxPREQretries;
	if (_chg_mesh_attr(NL80211_MESHCONF_PATH_REFRESH_TIME, mask))
		conf->path_refresh_time = nconf->path_refresh_time;
	if (_chg_mesh_attr(NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, mask))
		conf->min_discovery_timeout = nconf->min_discovery_timeout;
	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, mask))
		conf->dot11MeshHWMPactivePathTimeout =
			nconf->dot11MeshHWMPactivePathTimeout;
	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, mask))
		conf->dot11MeshHWMPpreqMinInterval =
			nconf->dot11MeshHWMPpreqMinInterval;
	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, mask))
		conf->dot11MeshHWMPperrMinInterval =
			nconf->dot11MeshHWMPperrMinInterval;
	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME, mask))
		conf->dot11MeshHWMPnetDiameterTraversalTime =
			nconf->dot11MeshHWMPnetDiameterTraversalTime;
	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOTMODE, mask)) {
		conf->dot11MeshHWMPRootMode = nconf->dot11MeshHWMPRootMode;
		ieee80211_mesh_root_setup(ifmsh);
	}
	if (_chg_mesh_attr(NL80211_MESHCONF_GATE_ANNOUNCEMENTS, mask)) {
		/* our current gate announcement implementation rides on root
		 * announcements, so require this ifmsh to also be a root node
		 */
		if (nconf->dot11MeshGateAnnouncementProtocol &&
		    !(conf->dot11MeshHWMPRootMode > IEEE80211_ROOTMODE_ROOT)) {
			conf->dot11MeshHWMPRootMode = IEEE80211_PROACTIVE_RANN;
			ieee80211_mesh_root_setup(ifmsh);
		}
		conf->dot11MeshGateAnnouncementProtocol =
			nconf->dot11MeshGateAnnouncementProtocol;
	}
	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_RANN_INTERVAL, mask))
		conf->dot11MeshHWMPRannInterval =
			nconf->dot11MeshHWMPRannInterval;
	if (_chg_mesh_attr(NL80211_MESHCONF_FORWARDING, mask))
		conf->dot11MeshForwarding = nconf->dot11MeshForwarding;
	if (_chg_mesh_attr(NL80211_MESHCONF_RSSI_THRESHOLD, mask)) {
		/* our RSSI threshold implementation is supported only for
		 * devices that report signal in dBm.
		 */
		if (!ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM))
			return -EOPNOTSUPP;
		conf->rssi_threshold = nconf->rssi_threshold;
	}
	if (_chg_mesh_attr(NL80211_MESHCONF_HT_OPMODE, mask)) {
		conf->ht_opmode = nconf->ht_opmode;
		sdata->vif.bss_conf.ht_operation_mode = nconf->ht_opmode;
		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
						  BSS_CHANGED_HT);
	}
	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, mask))
		conf->dot11MeshHWMPactivePathToRootTimeout =
			nconf->dot11MeshHWMPactivePathToRootTimeout;
	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOT_INTERVAL, mask))
		conf->dot11MeshHWMProotInterval =
			nconf->dot11MeshHWMProotInterval;
	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, mask))
		conf->dot11MeshHWMPconfirmationInterval =
			nconf->dot11MeshHWMPconfirmationInterval;
	if (_chg_mesh_attr(NL80211_MESHCONF_POWER_MODE, mask)) {
		conf->power_mode = nconf->power_mode;
		ieee80211_mps_local_status_update(sdata);
	}
	if (_chg_mesh_attr(NL80211_MESHCONF_AWAKE_WINDOW, mask))
		conf->dot11MeshAwakeWindowDuration =
			nconf->dot11MeshAwakeWindowDuration;
	if (_chg_mesh_attr(NL80211_MESHCONF_PLINK_TIMEOUT, mask))
		conf->plink_timeout = nconf->plink_timeout;
	if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask))
		conf->dot11MeshConnectedToMeshGate =
			nconf->dot11MeshConnectedToMeshGate;
	if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask))
		conf->dot11MeshNolearn = nconf->dot11MeshNolearn;
	if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_AS, mask))
		conf->dot11MeshConnectedToAuthServer =
			nconf->dot11MeshConnectedToAuthServer;
	ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON);
	return 0;
}
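/*
 * Join a mesh network: copy the mesh configuration and setup data into
 * the interface, claim a channel and start beaconing. Leaving tears the
 * mesh down again and releases the channel and the setup IEs.
 */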
static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev,
			       const struct mesh_config *conf,
			       const struct mesh_setup *setup)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_chan_req chanreq = { .oper = setup->chandef };
	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
	int err;

	lockdep_assert_wiphy(sdata->local->hw.wiphy);

	memcpy(&ifmsh->mshcfg, conf, sizeof(struct mesh_config));
	err = copy_mesh_setup(ifmsh, setup);
	if (err)
		return err;

	sdata->control_port_over_nl80211 = setup->control_port_over_nl80211;

	/* can mesh use other SMPS modes? */
	sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
	sdata->deflink.needed_rx_chains = sdata->local->rx_chains;

	err = ieee80211_link_use_channel(&sdata->deflink, &chanreq,
					 IEEE80211_CHANCTX_SHARED);
	if (err)
		return err;

	return ieee80211_start_mesh(sdata);
}

static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	lockdep_assert_wiphy(sdata->local->hw.wiphy);

	ieee80211_stop_mesh(sdata);
	ieee80211_link_release_channel(&sdata->deflink);
	kfree(sdata->u.mesh.ie);

	return 0;
}
#endif

static int ieee80211_change_bss(struct wiphy *wiphy,
				struct net_device *dev,
				struct bss_parameters *params)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_link_data *link;
	struct ieee80211_supported_band *sband;
	u64 changed = 0;

	link = ieee80211_link_or_deflink(sdata, params->link_id, true);
	if (IS_ERR(link))
		return PTR_ERR(link);

	if (!sdata_dereference(link->u.ap.beacon, sdata))
		return -ENOENT;

	sband = ieee80211_get_link_sband(link);
	if (!sband)
		return -EINVAL;

	if (params->basic_rates) {
		if (!ieee80211_parse_bitrates(link->conf->chanreq.oper.width,
					      wiphy->bands[sband->band],
					      params->basic_rates,
					      params->basic_rates_len,
					      &link->conf->basic_rates))
			return -EINVAL;
		changed |= BSS_CHANGED_BASIC_RATES;
		ieee80211_check_rate_mask(link);
	}

	if (params->use_cts_prot >= 0) {
		link->conf->use_cts_prot = params->use_cts_prot;
		changed |= BSS_CHANGED_ERP_CTS_PROT;
	}
	if (params->use_short_preamble >= 0) {
		link->conf->use_short_preamble = params->use_short_preamble;
		changed |= BSS_CHANGED_ERP_PREAMBLE;
	}

	if (!link->conf->use_short_slot &&
	    (sband->band == NL80211_BAND_5GHZ ||
	     sband->band == NL80211_BAND_6GHZ)) {
		link->conf->use_short_slot = true;
		changed |= BSS_CHANGED_ERP_SLOT;
	}

	if (params->use_short_slot_time >= 0) {
		link->conf->use_short_slot = params->use_short_slot_time;
		changed |= BSS_CHANGED_ERP_SLOT;
	}

	if (params->ap_isolate >= 0) {
		if (params->ap_isolate)
			sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
		else
			sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
		ieee80211_check_fast_rx_iface(sdata);
	}

	if (params->ht_opmode >= 0) {
		link->conf->ht_operation_mode = (u16)params->ht_opmode;
		changed |= BSS_CHANGED_HT;
	}

	if (params->p2p_ctwindow >= 0) {
		link->conf->p2p_noa_attr.oppps_ctwindow &=
					~IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
		link->conf->p2p_noa_attr.oppps_ctwindow |=
			params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
		changed |= BSS_CHANGED_P2P_PS;
	}

	if (params->p2p_opp_ps > 0) {
		link->conf->p2p_noa_attr.oppps_ctwindow |=
					IEEE80211_P2P_OPPPS_ENABLE_BIT;
		changed |= BSS_CHANGED_P2P_PS;
	} else if (params->p2p_opp_ps == 0) {
		link->conf->p2p_noa_attr.oppps_ctwindow &=
					~IEEE80211_P2P_OPPPS_ENABLE_BIT;
		changed |= BSS_CHANGED_P2P_PS;
	}

	ieee80211_link_info_change_notify(sdata, link, changed);

	return 0;
}

static int ieee80211_set_txq_params(struct wiphy *wiphy,
				    struct net_device *dev,
				    struct ieee80211_txq_params *params)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_link_data *link =
		ieee80211_link_or_deflink(sdata, params->link_id, true);
	struct ieee80211_tx_queue_params p;

	if (!local->ops->conf_tx)
		return -EOPNOTSUPP;

	if (local->hw.queues < IEEE80211_NUM_ACS)
		return -EOPNOTSUPP;

	if (IS_ERR(link))
		return PTR_ERR(link);

	memset(&p, 0, sizeof(p));
	p.aifs = params->aifs;
	p.cw_max = params->cwmax;
	p.cw_min = params->cwmin;
	p.txop = params->txop;
	/*
	 * Setting tx queue params disables u-apsd because it's only
	 * called in master mode.
	 */
	p.uapsd = false;

	ieee80211_regulatory_limit_wmm_params(sdata, &p, params->ac);

	link->tx_conf[params->ac] = p;
	if (drv_conf_tx(local, link, params->ac, &p)) {
		wiphy_debug(local->hw.wiphy,
			    "failed to set TX queue parameters for AC %d\n",
			    params->ac);
		return -EINVAL;
	}

	ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS);

	return 0;
}

#ifdef CONFIG_PM
static int ieee80211_suspend(struct wiphy *wiphy,
			     struct cfg80211_wowlan *wowlan)
{
	return __ieee80211_suspend(wiphy_priv(wiphy), wowlan);
}

static int ieee80211_resume(struct wiphy *wiphy)
{
	return __ieee80211_resume(wiphy_priv(wiphy));
}
#else
#define ieee80211_suspend NULL
#define ieee80211_resume NULL
#endif

static int ieee80211_scan(struct wiphy *wiphy,
			  struct cfg80211_scan_request *req)
{
	struct ieee80211_sub_if_data *sdata;

	sdata = IEEE80211_WDEV_TO_SUB_IF(req->wdev);

	switch (ieee80211_vif_type_p2p(&sdata->vif)) {
	case NL80211_IFTYPE_STATION:
	case NL80211_IFTYPE_ADHOC:
	case NL80211_IFTYPE_MESH_POINT:
	case NL80211_IFTYPE_P2P_CLIENT:
	case NL80211_IFTYPE_P2P_DEVICE:
		break;
	case NL80211_IFTYPE_P2P_GO:
		if (sdata->local->ops->hw_scan)
			break;
		/*
		 * FIXME: implement NoA while scanning in software,
		 * for now fall through to allow scanning only when
		 * beaconing hasn't been configured yet
		 */
		fallthrough;
	case NL80211_IFTYPE_AP:
		/*
		 * If the scan has been forced (and the driver supports
		 * forcing), don't care about us beaconing already.
		 * This will cause problems for the attached stations
		 * (e.g. all the frames sent while scanning on another
		 * channel will be lost).
		 */
		if (sdata->deflink.u.ap.beacon &&
		    (!(wiphy->features & NL80211_FEATURE_AP_SCAN) ||
		     !(req->flags & NL80211_SCAN_FLAG_AP)))
			return -EOPNOTSUPP;
		break;
	case NL80211_IFTYPE_NAN:
	default:
		return -EOPNOTSUPP;
	}

	return ieee80211_request_scan(sdata, req);
}

static void ieee80211_abort_scan(struct wiphy *wiphy, struct wireless_dev *wdev)
{
	ieee80211_scan_cancel(wiphy_priv(wiphy));
}

static int
ieee80211_sched_scan_start(struct wiphy *wiphy, struct net_device *dev,
			   struct cfg80211_sched_scan_request *req)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	if (!sdata->local->ops->sched_scan_start)
		return -EOPNOTSUPP;

	return ieee80211_request_sched_scan_start(sdata, req);
}

static int
ieee80211_sched_scan_stop(struct wiphy *wiphy, struct net_device *dev,
			  u64 reqid)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);

	if (!local->ops->sched_scan_stop)
		return -EOPNOTSUPP;

	return ieee80211_request_sched_scan_stop(local);
}

static int ieee80211_auth(struct wiphy *wiphy, struct net_device *dev,
			  struct cfg80211_auth_request *req)
{
	return ieee80211_mgd_auth(IEEE80211_DEV_TO_SUB_IF(dev), req);
}

static int ieee80211_assoc(struct wiphy *wiphy, struct net_device *dev,
			   struct cfg80211_assoc_request *req)
{
	return ieee80211_mgd_assoc(IEEE80211_DEV_TO_SUB_IF(dev), req);
}

static int ieee80211_deauth(struct wiphy *wiphy, struct net_device *dev,
			    struct cfg80211_deauth_request *req)
{
	return ieee80211_mgd_deauth(IEEE80211_DEV_TO_SUB_IF(dev), req);
}

static int ieee80211_disassoc(struct wiphy *wiphy, struct net_device *dev,
			      struct cfg80211_disassoc_request *req)
{
	return ieee80211_mgd_disassoc(IEEE80211_DEV_TO_SUB_IF(dev), req);
}

static int ieee80211_join_ibss(struct wiphy *wiphy, struct net_device *dev,
			       struct cfg80211_ibss_params *params)
{
	return ieee80211_ibss_join(IEEE80211_DEV_TO_SUB_IF(dev), params);
}

static int ieee80211_leave_ibss(struct wiphy *wiphy, struct net_device *dev)
{
	return ieee80211_ibss_leave(IEEE80211_DEV_TO_SUB_IF(dev));
}
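/* OCB ("outside the context of a BSS", 802.11p) mode join/leave handlers */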
static int
ieee80211_join_ocb(struct wiphy *wiphy, struct net_device *dev,
		   struct ocb_setup *setup)
{
	return ieee80211_ocb_join(IEEE80211_DEV_TO_SUB_IF(dev), setup);
}

static int ieee80211_leave_ocb(struct wiphy *wiphy, struct net_device *dev)
{
	return ieee80211_ocb_leave(IEEE80211_DEV_TO_SUB_IF(dev));
}

static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev,
				    int rate[NUM_NL80211_BANDS])
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);

	memcpy(sdata->vif.bss_conf.mcast_rate, rate,
	       sizeof(int) * NUM_NL80211_BANDS);

	if (ieee80211_sdata_running(sdata))
		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
						  BSS_CHANGED_MCAST_RATE);

	return 0;
}

static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);
	int err;

	if (changed & WIPHY_PARAM_FRAG_THRESHOLD) {
		ieee80211_check_fast_xmit_all(local);

		err = drv_set_frag_threshold(local, wiphy->frag_threshold);

		if (err) {
			ieee80211_check_fast_xmit_all(local);
			return err;
		}
	}

	if ((changed & WIPHY_PARAM_COVERAGE_CLASS) ||
	    (changed & WIPHY_PARAM_DYN_ACK)) {
		s16 coverage_class;

		coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ?
					wiphy->coverage_class : -1;
		err = drv_set_coverage_class(local, coverage_class);

		if (err)
			return err;
	}

	if (changed & WIPHY_PARAM_RTS_THRESHOLD) {
		err = drv_set_rts_threshold(local, wiphy->rts_threshold);

		if (err)
			return err;
	}

	if (changed & WIPHY_PARAM_RETRY_SHORT) {
		if (wiphy->retry_short > IEEE80211_MAX_TX_RETRY)
			return -EINVAL;
		local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
	}
	if (changed & WIPHY_PARAM_RETRY_LONG) {
		if (wiphy->retry_long > IEEE80211_MAX_TX_RETRY)
			return -EINVAL;
		local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
	}
	if (changed & (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG))
		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS);

	if (changed & (WIPHY_PARAM_TXQ_LIMIT |
		       WIPHY_PARAM_TXQ_MEMORY_LIMIT |
		       WIPHY_PARAM_TXQ_QUANTUM))
		ieee80211_txq_set_params(local);

	return 0;
}

static int ieee80211_set_tx_power(struct wiphy *wiphy,
				  struct wireless_dev *wdev,
				  enum nl80211_tx_power_setting type, int mbm)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);
	struct ieee80211_sub_if_data *sdata;
	enum nl80211_tx_power_setting txp_type = type;
	bool update_txp_type = false;
	bool has_monitor = false;
	int user_power_level;
	int old_power = local->user_power_level;

	lockdep_assert_wiphy(local->hw.wiphy);

	switch (type) {
	case NL80211_TX_POWER_AUTOMATIC:
		user_power_level = IEEE80211_UNSET_POWER_LEVEL;
		txp_type = NL80211_TX_POWER_LIMITED;
		break;
	case NL80211_TX_POWER_LIMITED:
	case NL80211_TX_POWER_FIXED:
		if (mbm < 0 || (mbm % 100))
			return -EOPNOTSUPP;
		user_power_level = MBM_TO_DBM(mbm);
		break;
	default:
		return -EINVAL;
	}

	if (wdev) {
		sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);

		if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
		    !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) {
			if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
				return -EOPNOTSUPP;

			sdata = wiphy_dereference(local->hw.wiphy,
						  local->monitor_sdata);
			if (!sdata)
				return -EOPNOTSUPP;
		}

		for (int link_id = 0; link_id < ARRAY_SIZE(sdata->link);
		     link_id++) {
			struct ieee80211_link_data *link =
				wiphy_dereference(wiphy, sdata->link[link_id]);

			if (!link)
				continue;

			link->user_power_level = user_power_level;
			if (txp_type != link->conf->txpower_type) {
				update_txp_type = true;
				link->conf->txpower_type = txp_type;
			}
			ieee80211_recalc_txpower(link, update_txp_type);
		}
		return 0;
	}

	local->user_power_level = user_power_level;
	list_for_each_entry(sdata, &local->interfaces, list) {
		if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
		    !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) {
			has_monitor = true;
			continue;
		}

		for (int link_id = 0; link_id < ARRAY_SIZE(sdata->link);
		     link_id++) {
			struct ieee80211_link_data *link =
				wiphy_dereference(wiphy, sdata->link[link_id]);

			if (!link)
				continue;

			link->user_power_level = local->user_power_level;
			if (txp_type != link->conf->txpower_type)
				update_txp_type = true;
			link->conf->txpower_type = txp_type;
		}
	}

	list_for_each_entry(sdata, &local->interfaces, list) {
		if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
		    !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))
			continue;

		for (int link_id = 0; link_id < ARRAY_SIZE(sdata->link);
		     link_id++) {
			struct ieee80211_link_data *link =
				wiphy_dereference(wiphy, sdata->link[link_id]);

			if (!link)
				continue;

			ieee80211_recalc_txpower(link, update_txp_type);
		}
	}

	if (has_monitor) {
		sdata = wiphy_dereference(local->hw.wiphy,
					  local->monitor_sdata);
		if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) {
			sdata->deflink.user_power_level =
				local->user_power_level;
			if (txp_type != sdata->vif.bss_conf.txpower_type)
				update_txp_type = true;
			sdata->vif.bss_conf.txpower_type = txp_type;

			ieee80211_recalc_txpower(&sdata->deflink,
						 update_txp_type);
		}
	}

	if (local->emulate_chanctx &&
	    (old_power != local->user_power_level))
		ieee80211_hw_conf_chan(local);

	return 0;
}

static int ieee80211_get_tx_power(struct wiphy *wiphy,
				  struct wireless_dev *wdev,
				  unsigned int link_id, int *dbm)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);
	struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
	struct ieee80211_link_data *link_data;

	if (local->ops->get_txpower &&
	    (sdata->flags & IEEE80211_SDATA_IN_DRIVER))
		return drv_get_txpower(local, sdata, link_id, dbm);

	if (local->emulate_chanctx) {
		*dbm = local->hw.conf.power_level;
	} else {
		link_data = wiphy_dereference(wiphy, sdata->link[link_id]);

		if (link_data)
			*dbm = link_data->conf->txpower;
		else
			return -ENOLINK;
	}

	/* INT_MIN indicates no power level was set yet */
	if (*dbm == INT_MIN)
		return -EINVAL;

	return 0;
}

static void ieee80211_rfkill_poll(struct wiphy *wiphy)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);

	drv_rfkill_poll(local);
}

#ifdef CONFIG_NL80211_TESTMODE
static int ieee80211_testmode_cmd(struct wiphy *wiphy,
				  struct wireless_dev *wdev,
				  void *data, int len)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);
	struct ieee80211_vif *vif = NULL;

	if (!local->ops->testmode_cmd)
		return -EOPNOTSUPP;

	if (wdev) {
		struct ieee80211_sub_if_data *sdata;

		sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
		if (sdata->flags & IEEE80211_SDATA_IN_DRIVER)
			vif = &sdata->vif;
	}

	return local->ops->testmode_cmd(&local->hw, vif, data, len);
}

static int ieee80211_testmode_dump(struct wiphy *wiphy,
				   struct sk_buff *skb,
				   struct netlink_callback *cb,
				   void *data, int len)
{
	struct ieee80211_local *local = wiphy_priv(wiphy);

	if (!local->ops->testmode_dump)
		return -EOPNOTSUPP;

	return local->ops->testmode_dump(&local->hw, skb, cb, data, len);
}
#endif
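/*
 * Request a new SM power save mode on a managed-mode link: store the
 * requested mode, resolve AUTOMATIC to OFF or DYNAMIC (depending on
 * powersave and the presence of authorized TDLS peers) and send an SM
 * Power Save action frame to the AP.
 */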
int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
				 struct ieee80211_link_data *link,
				 enum ieee80211_smps_mode smps_mode)
{
	const u8 *ap;
	enum ieee80211_smps_mode old_req;
	int err;
	struct sta_info *sta;
	bool tdls_peer_found = false;

	lockdep_assert_wiphy(sdata->local->hw.wiphy);

	if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION))
		return -EINVAL;

	if (!ieee80211_vif_link_active(&sdata->vif, link->link_id))
		return 0;

	old_req = link->u.mgd.req_smps;
	link->u.mgd.req_smps = smps_mode;

	/* The driver indicated that EML is enabled for the interface, which
	 * implies that SMPS flows towards the AP should be stopped.
	 */
	if (sdata->vif.driver_flags & IEEE80211_VIF_EML_ACTIVE)
		return 0;

	if (old_req == smps_mode &&
	    smps_mode != IEEE80211_SMPS_AUTOMATIC)
		return 0;

	/*
	 * If not associated, or current association is not an HT
	 * association, there's no need to do anything, just store
	 * the new value until we associate.
	 */
	if (!sdata->u.mgd.associated ||
	    link->conf->chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT)
		return 0;

	ap = sdata->vif.cfg.ap_addr;

	rcu_read_lock();
	list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
		if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded ||
		    !test_sta_flag(sta, WLAN_STA_AUTHORIZED))
			continue;

		tdls_peer_found = true;
		break;
	}
	rcu_read_unlock();

	if (smps_mode == IEEE80211_SMPS_AUTOMATIC) {
		if (tdls_peer_found || !sdata->u.mgd.powersave)
			smps_mode = IEEE80211_SMPS_OFF;
		else
			smps_mode = IEEE80211_SMPS_DYNAMIC;
	}

	/* send SM PS frame to AP */
	err = ieee80211_send_smps_action(sdata, smps_mode, ap, ap,
					 ieee80211_vif_is_mld(&sdata->vif) ?
					 link->link_id : -1);
	if (err)
		link->u.mgd.req_smps = old_req;
	else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found)
		ieee80211_teardown_tdls_peers(link);

	return err;
}

static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
				    bool enabled, int timeout)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
	unsigned int link_id;

	if (sdata->vif.type != NL80211_IFTYPE_STATION)
		return -EOPNOTSUPP;

	if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS))
		return -EOPNOTSUPP;

	if (enabled == sdata->u.mgd.powersave &&
	    timeout == local->dynamic_ps_forced_timeout)
		return 0;

	sdata->u.mgd.powersave = enabled;
	local->dynamic_ps_forced_timeout = timeout;

	/* no change, but if automatic follow powersave */
	for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
		struct ieee80211_link_data *link;

		link = sdata_dereference(sdata->link[link_id], sdata);
		if (!link)
			continue;
		__ieee80211_request_smps_mgd(sdata, link,
					     link->u.mgd.req_smps);
	}

	if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);

	ieee80211_recalc_ps(local);
	ieee80211_recalc_ps_vif(sdata);
	ieee80211_check_fast_rx_iface(sdata);

	return 0;
}

static void ieee80211_set_cqm_rssi_link(struct ieee80211_sub_if_data *sdata,
					struct ieee80211_link_data *link,
					s32 rssi_thold, u32 rssi_hyst,
					s32 rssi_low, s32 rssi_high)
{
	struct ieee80211_bss_conf *conf;

	if (!link || !link->conf)
		return;

	conf = link->conf;

	if (rssi_thold && rssi_hyst &&
	    rssi_thold == conf->cqm_rssi_thold &&
	    rssi_hyst == conf->cqm_rssi_hyst)
		return;

	conf->cqm_rssi_thold = rssi_thold;
	conf->cqm_rssi_hyst = rssi_hyst;
	conf->cqm_rssi_low = rssi_low;
	conf->cqm_rssi_high = rssi_high;
	link->u.mgd.last_cqm_event_signal = 0;

	if (!ieee80211_vif_link_active(&sdata->vif, link->link_id))
		return;

	if (sdata->u.mgd.associated &&
	    (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI))
		ieee80211_link_info_change_notify(sdata, link,
						  BSS_CHANGED_CQM);
}

static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy,
					 struct net_device *dev,
					 s32 rssi_thold, u32 rssi_hyst)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_vif *vif = &sdata->vif;
	int link_id;

	if (vif->driver_flags & IEEE80211_VIF_BEACON_FILTER &&
	    !(vif->driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI))
		return -EOPNOTSUPP;

	/* For MLD, handle CQM change on all the active links */
	for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
		struct ieee80211_link_data *link =
			sdata_dereference(sdata->link[link_id], sdata);

		ieee80211_set_cqm_rssi_link(sdata, link, rssi_thold, rssi_hyst,
					    0, 0);
	}

	return 0;
}

static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy,
					       struct net_device *dev,
					       s32 rssi_low, s32 rssi_high)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_vif *vif = &sdata->vif;
	int link_id;

	if (vif->driver_flags & IEEE80211_VIF_BEACON_FILTER)
		return -EOPNOTSUPP;

	/* For MLD, handle CQM change on all the active links */
	for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
		struct ieee80211_link_data *link =
			sdata_dereference(sdata->link[link_id], sdata);

		ieee80211_set_cqm_rssi_link(sdata, link, 0, 0,
					    rssi_low, rssi_high);
	}

	return 0;
}

static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
				      struct net_device *dev,
				      unsigned int link_id,
				      const u8 *addr,
				      const struct cfg80211_bitrate_mask *mask)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
	int i, ret;

	if (!ieee80211_sdata_running(sdata))
		return -ENETDOWN;

	/*
	 * If active, validate the setting and reject it if it doesn't leave
	 * at least one basic rate usable, since we really have to be able
	 * to send something, and if we're an AP we have to be able to do
	 * so at a basic rate so that all clients can receive it.
	 */
	if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) &&
	    sdata->vif.bss_conf.chanreq.oper.chan) {
		u32 basic_rates = sdata->vif.bss_conf.basic_rates;
		enum nl80211_band band;

		band = sdata->vif.bss_conf.chanreq.oper.chan->band;

		if (!(mask->control[band].legacy & basic_rates))
			return -EINVAL;
	}

	if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
		ret = drv_set_bitrate_mask(local, sdata, mask);
		if (ret)
			return ret;
	}

	for (i = 0; i < NUM_NL80211_BANDS; i++) {
		struct ieee80211_supported_band *sband = wiphy->bands[i];
		int j;

		sdata->rc_rateidx_mask[i] = mask->control[i].legacy;
		memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].ht_mcs,
		       sizeof(mask->control[i].ht_mcs));
		memcpy(sdata->rc_rateidx_vht_mcs_mask[i],
		       mask->control[i].vht_mcs,
		       sizeof(mask->control[i].vht_mcs));

		sdata->rc_has_mcs_mask[i] = false;
		sdata->rc_has_vht_mcs_mask[i] = false;

		if (!sband)
			continue;

		for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) {
			if (sdata->rc_rateidx_mcs_mask[i][j] != 0xff) {
				sdata->rc_has_mcs_mask[i] = true;
				break;
			}
		}

		for (j = 0; j < NL80211_VHT_NSS_MAX; j++) {
			if (sdata->rc_rateidx_vht_mcs_mask[i][j] != 0xffff) {
				sdata->rc_has_vht_mcs_mask[i] = true;
				break;
			}
		}
	}

	return 0;
}

static int ieee80211_start_radar_detection(struct wiphy *wiphy,
					   struct net_device *dev,
					   struct cfg80211_chan_def *chandef,
					   u32 cac_time_ms, int link_id)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_chan_req chanreq = { .oper = *chandef };
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_link_data *link_data;
	int err;

	lockdep_assert_wiphy(local->hw.wiphy);

	if (!list_empty(&local->roc_list) || local->scanning)
		return -EBUSY;

	link_data = sdata_dereference(sdata->link[link_id], sdata);
	if (!link_data)
		return -ENOLINK;

	/* whatever, but channel contexts should not complain about that one */
	link_data->smps_mode = IEEE80211_SMPS_OFF;
	link_data->needed_rx_chains = local->rx_chains;

	err = ieee80211_link_use_channel(link_data, &chanreq,
					 IEEE80211_CHANCTX_SHARED);
	if (err)
		return err;
	wiphy_delayed_work_queue(wiphy, &link_data->dfs_cac_timer_work,
				 msecs_to_jiffies(cac_time_ms));

	return 0;
}

static void ieee80211_end_cac(struct wiphy *wiphy,
			      struct net_device *dev, unsigned int link_id)
{
	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_link_data *link_data;

	lockdep_assert_wiphy(local->hw.wiphy);

	list_for_each_entry(sdata, &local->interfaces, list) {
		link_data = sdata_dereference(sdata->link[link_id], sdata);
		if (!link_data)
			continue;

		wiphy_delayed_work_cancel(wiphy,
					  &link_data->dfs_cac_timer_work);

		if (sdata->wdev.links[link_id].cac_started) {
			ieee80211_link_release_channel(link_data);
			sdata->wdev.links[link_id].cac_started = false;
		}
	}
}

static struct cfg80211_beacon_data *
cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
{
	struct cfg80211_beacon_data *new_beacon;
	u8 *pos;
	int len;

	len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len +
	      beacon->proberesp_ies_len + beacon->assocresp_ies_len +
	      beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len;

	if (beacon->mbssid_ies)
		len += ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies,
						       beacon->rnr_ies,
						       beacon->mbssid_ies->cnt);

	new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL);
	if (!new_beacon)
		return NULL;

	if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) {
		new_beacon->mbssid_ies =
			kzalloc(struct_size(new_beacon->mbssid_ies,
					    elem, beacon->mbssid_ies->cnt),
				GFP_KERNEL);
		if (!new_beacon->mbssid_ies) {
			kfree(new_beacon);
			return NULL;
		}

		if (beacon->rnr_ies && beacon->rnr_ies->cnt) {
			new_beacon->rnr_ies =
				kzalloc(struct_size(new_beacon->rnr_ies,
						    elem, beacon->rnr_ies->cnt),
					GFP_KERNEL);
			if (!new_beacon->rnr_ies) {
				kfree(new_beacon->mbssid_ies);
				kfree(new_beacon);
				return NULL;
			}
		}
	}

	pos = (u8 *)(new_beacon + 1);
	if (beacon->head_len) {
		new_beacon->head_len = beacon->head_len;
		new_beacon->head = pos;
		memcpy(pos, beacon->head, beacon->head_len);
		pos += beacon->head_len;
	}
	if (beacon->tail_len) {
		new_beacon->tail_len = beacon->tail_len;
		new_beacon->tail = pos;
		memcpy(pos, beacon->tail, beacon->tail_len);
		pos += beacon->tail_len;
	}
	if (beacon->beacon_ies_len) {
		new_beacon->beacon_ies_len = beacon->beacon_ies_len;
		new_beacon->beacon_ies = pos;
		memcpy(pos, beacon->beacon_ies, beacon->beacon_ies_len);
		pos += beacon->beacon_ies_len;
	}
	if (beacon->proberesp_ies_len) {
		new_beacon->proberesp_ies_len = beacon->proberesp_ies_len;
		new_beacon->proberesp_ies = pos;
		memcpy(pos, beacon->proberesp_ies, beacon->proberesp_ies_len);
		pos += beacon->proberesp_ies_len;
	}
	if (beacon->assocresp_ies_len) {
		new_beacon->assocresp_ies_len = beacon->assocresp_ies_len;
		new_beacon->assocresp_ies = pos;
		memcpy(pos, beacon->assocresp_ies, beacon->assocresp_ies_len);
		pos += beacon->assocresp_ies_len;
	}
	if (beacon->probe_resp_len) {
		new_beacon->probe_resp_len = beacon->probe_resp_len;
		new_beacon->probe_resp = pos;
		memcpy(pos, beacon->probe_resp, beacon->probe_resp_len);
		pos += beacon->probe_resp_len;
	}
	if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) {
		pos += ieee80211_copy_mbssid_beacon(pos,
						    new_beacon->mbssid_ies,
						    beacon->mbssid_ies);
		if (beacon->rnr_ies && beacon->rnr_ies->cnt)
			pos += ieee80211_copy_rnr_beacon(pos,
							 new_beacon->rnr_ies,
							 beacon->rnr_ies);
	}

	/* might copy -1, meaning no changes requested */
	new_beacon->ftm_responder = beacon->ftm_responder;
	if (beacon->lci) {
		new_beacon->lci_len = beacon->lci_len;
		new_beacon->lci = pos;
		memcpy(pos, beacon->lci, beacon->lci_len);
		pos += beacon->lci_len;
	}
	if (beacon->civicloc) {
		new_beacon->civicloc_len = beacon->civicloc_len;
		new_beacon->civicloc = pos;
		memcpy(pos, beacon->civicloc, beacon->civicloc_len);
		pos += beacon->civicloc_len;
	}

	return new_beacon;
}
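/*
 * Called by drivers when the channel switch countdown reached zero;
 * queues the CSA finalization work, and for an MBSSID transmitting
 * interface also kicks the finalization work of all non-transmitting
 * interfaces that depend on it.
 */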
void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id)
{
	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_link_data *link_data;

	if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
		return;

	rcu_read_lock();
	link_data = rcu_dereference(sdata->link[link_id]);
	if (WARN_ON(!link_data)) {
		rcu_read_unlock();
		return;
	}

	/* TODO: MBSSID with MLO changes */
	if (vif->mbssid_tx_vif == vif) {
		/* Trigger ieee80211_csa_finish() on the non-transmitting
		 * interfaces when channel switch is received on
		 * transmitting interface
		 */
		struct ieee80211_sub_if_data *iter;

		list_for_each_entry_rcu(iter, &local->interfaces, list) {
			if (!ieee80211_sdata_running(iter))
				continue;

			if (iter == sdata || iter->vif.mbssid_tx_vif != vif)
				continue;

			wiphy_work_queue(iter->local->hw.wiphy,
					 &iter->deflink.csa.finalize_work);
		}
	}
	wiphy_work_queue(local->hw.wiphy, &link_data->csa.finalize_work);

	rcu_read_unlock();
}
EXPORT_SYMBOL(ieee80211_csa_finish);

void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif)
{
	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
	struct ieee80211_local *local = sdata->local;

	sdata_info(sdata, "channel switch failed, disconnecting\n");
	wiphy_work_queue(local->hw.wiphy, &ifmgd->csa_connection_drop_work);
}
EXPORT_SYMBOL(ieee80211_channel_switch_disconnect);

static int ieee80211_set_after_csa_beacon(struct ieee80211_link_data *link_data,
					  u64 *changed)
{
	struct ieee80211_sub_if_data *sdata = link_data->sdata;
	int err;

	switch (sdata->vif.type) {
	case NL80211_IFTYPE_AP:
		if (!link_data->u.ap.next_beacon)
			return -EINVAL;

		err = ieee80211_assign_beacon(sdata, link_data,
					      link_data->u.ap.next_beacon,
					      NULL, NULL, changed);
		ieee80211_free_next_beacon(link_data);

		if (err < 0)
			return err;
		break;
	case NL80211_IFTYPE_ADHOC:
		err = ieee80211_ibss_finish_csa(sdata, changed);
		if (err < 0)
			return err;
		break;
#ifdef CONFIG_MAC80211_MESH
	case NL80211_IFTYPE_MESH_POINT:
		err = ieee80211_mesh_finish_csa(sdata, changed);
		if (err < 0)
			return err;
		break;
#endif
	default:
		WARN_ON(1);
		return -EINVAL;
	}

	return 0;
}

static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
{
	struct ieee80211_sub_if_data *sdata = link_data->sdata;
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_bss_conf *link_conf = link_data->conf;
	u64 changed = 0;
	int err;

	lockdep_assert_wiphy(local->hw.wiphy);
	/*
	 * Using the reservation isn't immediate as it may be deferred until
	 * later with multi-vif. Once the reservation is complete it will
	 * re-schedule the work with no reserved_chanctx, so verify the
	 * chandef to check if it completed successfully.
	 */
	if (link_data->reserved_chanctx) {
		/*
		 * with multi-vif csa driver may call ieee80211_csa_finish()
		 * many times while waiting for other interfaces to use their
		 * reservations
		 */
		if (link_data->reserved_ready)
			return 0;

		return ieee80211_link_use_reserved_context(link_data);
	}

	if (!cfg80211_chandef_identical(&link_conf->chanreq.oper,
					&link_data->csa.chanreq.oper))
		return -EINVAL;

	link_conf->csa_active = false;

	err = ieee80211_set_after_csa_beacon(link_data, &changed);
	if (err)
		return err;

	ieee80211_link_info_change_notify(sdata, link_data, changed);

	ieee80211_vif_unblock_queues_csa(sdata);

	err = drv_post_channel_switch(link_data);
	if (err)
		return err;

	cfg80211_ch_switch_notify(sdata->dev, &link_data->csa.chanreq.oper,
				  link_data->link_id);

	return 0;
}

static void ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
{
	struct ieee80211_sub_if_data *sdata = link_data->sdata;

	if (__ieee80211_csa_finalize(link_data)) {
		sdata_info(sdata, "failed to finalize CSA on link %d, disconnecting\n",
			   link_data->link_id);
		cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev,
				    GFP_KERNEL);
	}
}

void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work)
{
	struct ieee80211_link_data *link =
		container_of(work, struct ieee80211_link_data,
			     csa.finalize_work);
	struct ieee80211_sub_if_data *sdata = link->sdata;
	struct ieee80211_local *local = sdata->local;

	lockdep_assert_wiphy(local->hw.wiphy);

	/* AP might have been stopped while waiting for the lock. */
	if (!link->conf->csa_active)
		return;

	if (!ieee80211_sdata_running(sdata))
		return;

	ieee80211_csa_finalize(link);
}
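/*
 * Install the CSA beacon for the interface type at hand: for AP the
 * countdown beacon plus the post-switch beacon, for IBSS and mesh the
 * respective CSA beacon plus a CSA action frame announcing the switch.
 */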
static int ieee80211_set_csa_beacon(struct ieee80211_link_data *link_data,
				    struct cfg80211_csa_settings *params,
				    u64 *changed)
{
	struct ieee80211_sub_if_data *sdata = link_data->sdata;
	struct ieee80211_csa_settings csa = {};
	int err;

	switch (sdata->vif.type) {
	case NL80211_IFTYPE_AP:
		link_data->u.ap.next_beacon =
			cfg80211_beacon_dup(&params->beacon_after);
		if (!link_data->u.ap.next_beacon)
			return -ENOMEM;

		/*
		 * With a count of 0, we don't have to wait for any
		 * TBTT before switching, so complete the CSA
		 * immediately. In theory, with a count == 1 we
		 * should delay the switch until just before the next
		 * TBTT, but that would complicate things so we switch
		 * immediately too. If we would delay the switch
		 * until the next TBTT, we would have to set the probe
		 * response here.
		 *
		 * TODO: A channel switch with count <= 1 without
		 * sending a CSA action frame is kind of useless,
		 * because the clients won't know we're changing
		 * channels. The action frame must be implemented
		 * either here or in userspace.
		 */
		if (params->count <= 1)
			break;

		if ((params->n_counter_offsets_beacon >
		     IEEE80211_MAX_CNTDWN_COUNTERS_NUM) ||
		    (params->n_counter_offsets_presp >
		     IEEE80211_MAX_CNTDWN_COUNTERS_NUM)) {
			ieee80211_free_next_beacon(link_data);
			return -EINVAL;
		}

		csa.counter_offsets_beacon = params->counter_offsets_beacon;
		csa.counter_offsets_presp = params->counter_offsets_presp;
		csa.n_counter_offsets_beacon = params->n_counter_offsets_beacon;
		csa.n_counter_offsets_presp = params->n_counter_offsets_presp;
		csa.count = params->count;

		err = ieee80211_assign_beacon(sdata, link_data,
					      &params->beacon_csa, &csa,
					      NULL, changed);
		if (err < 0) {
			ieee80211_free_next_beacon(link_data);
			return err;
		}

		break;
	case NL80211_IFTYPE_ADHOC:
		if (!sdata->vif.cfg.ibss_joined)
			return -EINVAL;

		if (params->chandef.width != sdata->u.ibss.chandef.width)
			return -EINVAL;

		switch (params->chandef.width) {
		case NL80211_CHAN_WIDTH_40:
			if (cfg80211_get_chandef_type(&params->chandef) !=
			    cfg80211_get_chandef_type(&sdata->u.ibss.chandef))
				return -EINVAL;
			break;
		case NL80211_CHAN_WIDTH_5:
		case NL80211_CHAN_WIDTH_10:
		case NL80211_CHAN_WIDTH_20_NOHT:
		case NL80211_CHAN_WIDTH_20:
			break;
		default:
			return -EINVAL;
		}

		/* changes into another band are not supported */
		if (sdata->u.ibss.chandef.chan->band !=
		    params->chandef.chan->band)
			return -EINVAL;

		/* see comments in the NL80211_IFTYPE_AP block */
		if (params->count > 1) {
			err = ieee80211_ibss_csa_beacon(sdata, params, changed);
			if (err < 0)
				return err;
		}

		ieee80211_send_action_csa(sdata, params);

		break;
#ifdef CONFIG_MAC80211_MESH
	case NL80211_IFTYPE_MESH_POINT: {
		struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;

		/* changes into another band are not supported */
		if (sdata->vif.bss_conf.chanreq.oper.chan->band !=
		    params->chandef.chan->band)
			return -EINVAL;

		if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_NONE) {
			ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_INIT;
			if (!ifmsh->pre_value)
				ifmsh->pre_value = 1;
			else
				ifmsh->pre_value++;
		}

		/* see comments in the NL80211_IFTYPE_AP block */
		if (params->count > 1) {
			err = ieee80211_mesh_csa_beacon(sdata, params, changed);
			if (err < 0) {
				ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE;
				return err;
			}
		}

		if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_INIT)
			ieee80211_send_action_csa(sdata, params);

		break;
	}
#endif
	default:
		return -EOPNOTSUPP;
	}

	return 0;
}

static void ieee80211_color_change_abort(struct ieee80211_link_data *link)
{
	link->conf->color_change_active = false;

	ieee80211_free_next_beacon(link);

	cfg80211_color_change_aborted_notify(link->sdata->dev, link->link_id);
}
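/*
 * Core of the channel_switch handler: validates the request against the
 * current channel context, asks the driver to prepare via
 * drv_pre_channel_switch(), reserves the new channel context and starts
 * the CSA countdown (or finalizes immediately if no beacon change was
 * needed).
 */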
allow another channel switch if one is already active. */ if (link_conf->csa_active) return -EBUSY; conf = wiphy_dereference(wiphy, link_conf->chanctx_conf); if (!conf) { err = -EBUSY; goto out; } if (params->chandef.chan->freq_offset) { /* this may work, but is untested */ err = -EOPNOTSUPP; goto out; } chanctx = container_of(conf, struct ieee80211_chanctx, conf); ch_switch.timestamp = 0; ch_switch.device_timestamp = 0; ch_switch.block_tx = params->block_tx; ch_switch.chandef = chanreq.oper; ch_switch.count = params->count; err = drv_pre_channel_switch(sdata, &ch_switch); if (err) goto out; err = ieee80211_link_reserve_chanctx(link_data, &chanreq, chanctx->mode, params->radar_required); if (err) goto out; /* if reservation is invalid then this will fail */ err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0, -1); if (err) { ieee80211_link_unreserve_chanctx(link_data); goto out; } /* if there is a color change in progress, abort it */ if (link_conf->color_change_active) ieee80211_color_change_abort(link_data); err = ieee80211_set_csa_beacon(link_data, params, &changed); if (err) { ieee80211_link_unreserve_chanctx(link_data); goto out; } link_data->csa.chanreq = chanreq; link_conf->csa_active = true; if (params->block_tx) ieee80211_vif_block_queues_csa(sdata); cfg80211_ch_switch_started_notify(sdata->dev, &link_data->csa.chanreq.oper, link_id, params->count, params->block_tx); if (changed) { ieee80211_link_info_change_notify(sdata, link_data, changed); drv_channel_switch_beacon(sdata, &link_data->csa.chanreq.oper); } else { /* if the beacon didn't change, we can finalize immediately */ ieee80211_csa_finalize(link_data); } out: return err; } int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); return __ieee80211_channel_switch(wiphy, dev, params); } u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); local->roc_cookie_counter++; /* wow, you wrapped 64 bits ... 
more likely a bug */ if (WARN_ON(local->roc_cookie_counter == 0)) local->roc_cookie_counter++; return local->roc_cookie_counter; } int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u64 *cookie, gfp_t gfp) { unsigned long spin_flags; struct sk_buff *ack_skb; int id; ack_skb = skb_copy(skb, gfp); if (!ack_skb) return -ENOMEM; spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); if (id < 0) { kfree_skb(ack_skb); return -ENOMEM; } IEEE80211_SKB_CB(skb)->status_data_idr = 1; IEEE80211_SKB_CB(skb)->status_data = id; *cookie = ieee80211_mgmt_tx_cookie(local); IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; return 0; } static void ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct wireless_dev *wdev, struct mgmt_frame_regs *upd) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); u32 action_mask = BIT(IEEE80211_STYPE_ACTION >> 4); bool global_change, intf_change; global_change = (local->probe_req_reg != !!(upd->global_stypes & preq_mask)) || (local->rx_mcast_action_reg != !!(upd->global_mcast_stypes & action_mask)); local->probe_req_reg = upd->global_stypes & preq_mask; local->rx_mcast_action_reg = upd->global_mcast_stypes & action_mask; intf_change = (sdata->vif.probe_req_reg != !!(upd->interface_stypes & preq_mask)) || (sdata->vif.rx_mcast_action_reg != !!(upd->interface_mcast_stypes & action_mask)); sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; sdata->vif.rx_mcast_action_reg = upd->interface_mcast_stypes & action_mask; if (!local->open_count) return; if (intf_change && ieee80211_sdata_running(sdata)) drv_config_iface_filter(local, sdata, sdata->vif.probe_req_reg ? 
FIF_PROBE_REQ : 0, FIF_PROBE_REQ); if (global_change) ieee80211_configure_filter(local); } static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); int ret; if (local->started) return -EOPNOTSUPP; ret = drv_set_antenna(local, tx_ant, rx_ant); if (ret) return ret; local->rx_chains = hweight8(rx_ant); return 0; } static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); return drv_get_antenna(local, tx_ant, rx_ant); } static int ieee80211_set_rekey_data(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_gtk_rekey_data *data) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (!local->ops->set_rekey_data) return -EOPNOTSUPP; drv_set_rekey_data(local, sdata, data); return 0; } static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos; struct ieee80211_tx_info *info; struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; enum nl80211_band band; int ret; /* the lock is needed to assign the cookie later */ lockdep_assert_wiphy(local->hw.wiphy); rcu_read_lock(); sta = sta_info_get_bss(sdata, peer); if (!sta) { ret = -ENOLINK; goto unlock; } qos = sta->sta.wme; chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { ret = -EINVAL; goto unlock; } band = chanctx_conf->def.chan->band; if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_FCTL_FROMDS); } else { size -= 2; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) { ret = -ENOMEM; goto unlock; } skb->dev = dev; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put(skb, size); nullfunc->frame_control = fc; nullfunc->duration_id = 0; memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); nullfunc->seq_ctrl = 0; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_NL80211_FRAME_TX; info->band = band; skb_set_queue_mapping(skb, IEEE80211_AC_VO); skb->priority = 7; if (qos) nullfunc->qos_ctrl = cpu_to_le16(7); ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_ATOMIC); if (ret) { kfree_skb(skb); goto unlock; } local_bh_disable(); ieee80211_xmit(sdata, sta, skb); local_bh_enable(); ret = 0; unlock: rcu_read_unlock(); return ret; } static int ieee80211_cfg_get_channel(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_link_data *link; int ret = -ENODATA; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (!link) { ret = -ENOLINK; goto out; } chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (chanctx_conf) { *chandef = link->conf->chanreq.oper; ret = 0; } else if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && local->open_count > 0 && 
local->open_count == local->monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) { *chandef = local->monitor_chanreq.oper; ret = 0; } out: rcu_read_unlock(); return ret; } #ifdef CONFIG_PM static void ieee80211_set_wakeup(struct wiphy *wiphy, bool enabled) { drv_set_wakeup(wiphy_priv(wiphy), enabled); } #endif static int ieee80211_set_qos_map(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_qos_map *qos_map) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct mac80211_qos_map *new_qos_map, *old_qos_map; if (qos_map) { new_qos_map = kzalloc(sizeof(*new_qos_map), GFP_KERNEL); if (!new_qos_map) return -ENOMEM; memcpy(&new_qos_map->qos_map, qos_map, sizeof(*qos_map)); } else { /* A NULL qos_map was passed to disable QoS mapping */ new_qos_map = NULL; } old_qos_map = sdata_dereference(sdata->qos_map, sdata); rcu_assign_pointer(sdata->qos_map, new_qos_map); if (old_qos_map) kfree_rcu(old_qos_map, rcu_head); return 0; } static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id, struct cfg80211_chan_def *chandef) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; struct ieee80211_chan_req chanreq = { .oper = *chandef }; int ret; u64 changed = 0; link = sdata_dereference(sdata->link[link_id], sdata); ret = ieee80211_link_change_chanreq(link, &chanreq, &changed); if (ret == 0) ieee80211_link_info_change_notify(sdata, link, changed); return ret; } static int ieee80211_add_tx_ts(struct wiphy *wiphy, struct net_device *dev, u8 tsid, const u8 *peer, u8 up, u16 admitted_time) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int ac = ieee802_1d_to_ac[up]; if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!(sdata->wmm_acm & BIT(up))) return -EINVAL; if (ifmgd->tx_tspec[ac].admitted_time) return -EBUSY; if (admitted_time) { ifmgd->tx_tspec[ac].admitted_time = 32 * admitted_time; ifmgd->tx_tspec[ac].tsid = tsid; ifmgd->tx_tspec[ac].up = up; } return 0; } static int ieee80211_del_tx_ts(struct wiphy *wiphy, struct net_device *dev, u8 tsid, const u8 *peer) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = wiphy_priv(wiphy); int ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac]; /* skip unused entries */ if (!tx_tspec->admitted_time) continue; if (tx_tspec->tsid != tsid) continue; /* due to this, new packets will be reassigned to non-ACM ACs */ tx_tspec->up = -1; /* Make sure that all packets have been sent, to avoid * restoring the QoS params on packets that are still on the * queues.
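* (synchronize_net() below makes sure concurrent transmitters observe the
* tx_tspec->up change before the queues are flushed.)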
*/ synchronize_net(); ieee80211_flush_queues(local, sdata, false); /* restore the normal QoS parameters * (unconditionally to avoid races) */ tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE; tx_tspec->downgraded = false; ieee80211_sta_handle_tspec_ac_params(sdata); /* finally clear all the data */ memset(tx_tspec, 0, sizeof(*tx_tspec)); return 0; } return -ENOENT; } void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, u8 inst_id, enum nl80211_nan_func_term_reason reason, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct cfg80211_nan_func *func; u64 cookie; if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = idr_find(&sdata->u.nan.function_inst_ids, inst_id); if (WARN_ON(!func)) { spin_unlock_bh(&sdata->u.nan.func_lock); return; } cookie = func->cookie; idr_remove(&sdata->u.nan.function_inst_ids, inst_id); spin_unlock_bh(&sdata->u.nan.func_lock); cfg80211_free_nan_func(func); cfg80211_nan_func_terminated(ieee80211_vif_to_wdev(vif), inst_id, reason, cookie, gfp); } EXPORT_SYMBOL(ieee80211_nan_func_terminated); void ieee80211_nan_func_match(struct ieee80211_vif *vif, struct cfg80211_nan_match_params *match, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct cfg80211_nan_func *func; if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) return; spin_lock_bh(&sdata->u.nan.func_lock); func = idr_find(&sdata->u.nan.function_inst_ids, match->inst_id); if (WARN_ON(!func)) { spin_unlock_bh(&sdata->u.nan.func_lock); return; } match->cookie = func->cookie; spin_unlock_bh(&sdata->u.nan.func_lock); cfg80211_nan_match(ieee80211_vif_to_wdev(vif), match, gfp); } EXPORT_SYMBOL(ieee80211_nan_func_match); static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy, struct net_device *dev, const bool enabled) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata->u.ap.multicast_to_unicast = enabled; return 0; } void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, struct txq_info *txqi) { if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_BYTES))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_BYTES); txqstats->backlog_bytes = txqi->tin.backlog_bytes; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS); txqstats->backlog_packets = txqi->tin.backlog_packets; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_FLOWS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_FLOWS); txqstats->flows = txqi->tin.flows; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_DROPS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_DROPS); txqstats->drops = txqi->cstats.drop_count; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_ECN_MARKS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_ECN_MARKS); txqstats->ecn_marks = txqi->cstats.ecn_mark; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_OVERLIMIT))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_OVERLIMIT); txqstats->overlimit = txqi->tin.overlimit; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_COLLISIONS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_COLLISIONS); txqstats->collisions = txqi->tin.collisions; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_BYTES))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_BYTES); txqstats->tx_bytes = txqi->tin.tx_bytes; } if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_PACKETS))) { txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_PACKETS); txqstats->tx_packets = txqi->tin.tx_packets; } } static int 
ieee80211_get_txq_stats(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_txq_stats *txqstats) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; int ret = 0; spin_lock_bh(&local->fq.lock); rcu_read_lock(); if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (!sdata->vif.txq) { ret = 1; goto out; } ieee80211_fill_txq_stats(txqstats, to_txq_info(sdata->vif.txq)); } else { /* phy stats */ txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS) | BIT(NL80211_TXQ_STATS_BACKLOG_BYTES) | BIT(NL80211_TXQ_STATS_OVERLIMIT) | BIT(NL80211_TXQ_STATS_OVERMEMORY) | BIT(NL80211_TXQ_STATS_COLLISIONS) | BIT(NL80211_TXQ_STATS_MAX_FLOWS); txqstats->backlog_packets = local->fq.backlog; txqstats->backlog_bytes = local->fq.memory_usage; txqstats->overlimit = local->fq.overlimit; txqstats->overmemory = local->fq.overmemory; txqstats->collisions = local->fq.collisions; txqstats->max_flows = local->fq.flows_cnt; } out: rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return ret; } static int ieee80211_get_ftm_responder_stats(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ftm_responder_stats *ftm_stats) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); return drv_get_ftm_responder_stats(local, sdata, ftm_stats); } static int ieee80211_start_pmsr(struct wiphy *wiphy, struct wireless_dev *dev, struct cfg80211_pmsr_request *request) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev); return drv_start_pmsr(local, sdata, request); } static void ieee80211_abort_pmsr(struct wiphy *wiphy, struct wireless_dev *dev, struct cfg80211_pmsr_request *request) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev); return drv_abort_pmsr(local, sdata, request); } static int ieee80211_set_tid_config(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_tid_config *tid_conf) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!sdata->local->ops->set_tid_config) return -EOPNOTSUPP; if (!tid_conf->peer) return drv_set_tid_config(sdata->local, sdata, NULL, tid_conf); sta = sta_info_get_bss(sdata, tid_conf->peer); if (!sta) return -ENOENT; return drv_set_tid_config(sdata->local, sdata, &sta->sta, tid_conf); } static int ieee80211_reset_tid_config(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u8 tids) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!sdata->local->ops->reset_tid_config) return -EOPNOTSUPP; if (!peer) return drv_reset_tid_config(sdata->local, sdata, NULL, tids); sta = sta_info_get_bss(sdata, peer); if (!sta) return -ENOENT; return drv_reset_tid_config(sdata->local, sdata, &sta->sta, tids); } static int ieee80211_set_sar_specs(struct wiphy *wiphy, struct cfg80211_sar_specs *sar) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->set_sar_specs) return -EOPNOTSUPP; return local->ops->set_sar_specs(&local->hw, sar); } static int ieee80211_set_after_color_change_beacon(struct ieee80211_link_data *link, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: { int ret; if (!link->u.ap.next_beacon) return -EINVAL; ret = ieee80211_assign_beacon(sdata, link, 
link->u.ap.next_beacon, NULL, NULL, changed); ieee80211_free_next_beacon(link); if (ret < 0) return ret; break; } default: WARN_ON_ONCE(1); return -EINVAL; } return 0; } static int ieee80211_set_color_change_beacon(struct ieee80211_link_data *link, struct cfg80211_color_change_settings *params, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_color_change_settings color_change = {}; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: link->u.ap.next_beacon = cfg80211_beacon_dup(&params->beacon_next); if (!link->u.ap.next_beacon) return -ENOMEM; if (params->count <= 1) break; color_change.counter_offset_beacon = params->counter_offset_beacon; color_change.counter_offset_presp = params->counter_offset_presp; color_change.count = params->count; err = ieee80211_assign_beacon(sdata, link, &params->beacon_color_change, NULL, &color_change, changed); if (err < 0) { ieee80211_free_next_beacon(link); return err; } break; default: return -EOPNOTSUPP; } return 0; } static void ieee80211_color_change_bss_config_notify(struct ieee80211_link_data *link, u8 color, int enable, u64 changed) { struct ieee80211_sub_if_data *sdata = link->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); link->conf->he_bss_color.color = color; link->conf->he_bss_color.enabled = enable; changed |= BSS_CHANGED_HE_BSS_COLOR; ieee80211_link_info_change_notify(sdata, link, changed); if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) { struct ieee80211_sub_if_data *child; list_for_each_entry(child, &sdata->local->interfaces, list) { if (child != sdata && child->vif.mbssid_tx_vif == &sdata->vif) { child->vif.bss_conf.he_bss_color.color = color; child->vif.bss_conf.he_bss_color.enabled = enable; ieee80211_link_info_change_notify(child, &child->deflink, BSS_CHANGED_HE_BSS_COLOR); } } } } static int ieee80211_color_change_finalize(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); link->conf->color_change_active = false; err = ieee80211_set_after_color_change_beacon(link, &changed); if (err) { cfg80211_color_change_aborted_notify(sdata->dev, link->link_id); return err; } ieee80211_color_change_bss_config_notify(link, link->conf->color_change_color, 1, changed); cfg80211_color_change_notify(sdata->dev, link->link_id); return 0; } void ieee80211_color_change_finalize_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, color_change_finalize_work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); /* AP might have been stopped while waiting for the lock. 
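* An abort also clears color_change_active, so the first check below
* covers that case as well.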
*/ if (!link_conf->color_change_active) return; if (!ieee80211_sdata_running(sdata)) return; ieee80211_color_change_finalize(link); } void ieee80211_color_collision_detection_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, color_collision_detect_work.work); struct ieee80211_sub_if_data *sdata = link->sdata; cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap, link->link_id); } void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); return; } wiphy_work_queue(sdata->local->hw.wiphy, &link->color_change_finalize_work); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_color_change_finish); void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, u64 color_bitmap, u8 link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_link_data *link; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return; rcu_read_lock(); link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); return; } if (link->conf->color_change_active || link->conf->csa_active) { rcu_read_unlock(); return; } if (wiphy_delayed_work_pending(sdata->local->hw.wiphy, &link->color_collision_detect_work)) { rcu_read_unlock(); return; } link->color_bitmap = color_bitmap; /* queue the color collision detection event every 500 ms in order to * avoid sending too many netlink messages to userspace. */ wiphy_delayed_work_queue(sdata->local->hw.wiphy, &link->color_collision_detect_work, msecs_to_jiffies(500)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_obss_color_collision_notify); static int ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_color_change_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_bss_conf *link_conf; struct ieee80211_link_data *link; u8 link_id = params->link_id; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return -EINVAL; link = wiphy_dereference(wiphy, sdata->link[link_id]); if (!link) return -ENOLINK; link_conf = link->conf; if (link_conf->nontransmitted) return -EINVAL; /* don't allow another color change if one is already active or if csa * is active */ if (link_conf->color_change_active || link_conf->csa_active) { err = -EBUSY; goto out; } err = ieee80211_set_color_change_beacon(link, params, &changed); if (err) goto out; link_conf->color_change_active = true; link_conf->color_change_color = params->color; cfg80211_color_change_started_notify(sdata->dev, params->count, link_id); if (changed) ieee80211_color_change_bss_config_notify(link, 0, 0, changed); else /* if the beacon didn't change, we can finalize immediately */ ieee80211_color_change_finalize(link); out: return err; } static int ieee80211_set_radar_background(struct wiphy *wiphy, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = wiphy_priv(wiphy); if (!local->ops->set_radar_background) return -EOPNOTSUPP; return local->ops->set_radar_background(&local->hw, chandef); } static int ieee80211_add_intf_link(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id) { struct
ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); lockdep_assert_wiphy(sdata->local->hw.wiphy); if (wdev->use_4addr) return -EOPNOTSUPP; return ieee80211_vif_set_links(sdata, wdev->valid_links, 0); } static void ieee80211_del_intf_link(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); u16 new_links = wdev->valid_links & ~BIT(link_id); lockdep_assert_wiphy(sdata->local->hw.wiphy); /* During the link teardown process, certain functions require the * link_id to remain in the valid_links bitmap. Therefore, instead * of removing the link_id from the bitmap, pass a masked value to * simulate as if link_id does not exist anymore. */ ieee80211_vif_set_links(sdata, new_links, 0); } static int ieee80211_add_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; int ret; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!sta->sta.valid_links) return -EINVAL; if (sta->sta.valid_links & BIT(params->link_id)) return -EALREADY; ret = ieee80211_sta_allocate_link(sta, params->link_id); if (ret) return ret; ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_NEW, params); if (ret) { ieee80211_sta_free_link(sta, params->link_id); return ret; } if (test_sta_flag(sta, WLAN_STA_ASSOC)) { struct link_sta_info *link_sta; link_sta = sdata_dereference(sta->link[params->link_id], sdata); rate_control_rate_init(link_sta); } /* ieee80211_sta_activate_link frees the link upon failure */ return ieee80211_sta_activate_link(sta, params->link_id); } static int ieee80211_mod_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = wiphy_priv(wiphy); struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!(sta->sta.valid_links & BIT(params->link_id))) return -EINVAL; return sta_link_apply_parameters(local, sta, STA_LINK_MODE_LINK_MODIFY, params); } static int ieee80211_del_link_station(struct wiphy *wiphy, struct net_device *dev, struct link_station_del_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get_bss(sdata, params->mld_mac); if (!sta) return -ENOENT; if (!(sta->sta.valid_links & BIT(params->link_id))) return -EINVAL; /* must not create a STA without links */ if (sta->sta.valid_links == BIT(params->link_id)) return -EINVAL; ieee80211_sta_remove_link(sta, params->link_id); return 0; } static int ieee80211_set_hw_timestamp(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; if (!local->ops->set_hw_timestamp) return -EOPNOTSUPP; if (!check_sdata_in_driver(sdata)) return -EIO; return local->ops->set_hw_timestamp(&local->hw, &sdata->vif, hwts); } static int ieee80211_set_ttlm(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ttlm_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); lockdep_assert_wiphy(sdata->local->hw.wiphy); 
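/* hand the negotiated TID-to-link mapping request off to the MLME code */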
return ieee80211_req_neg_ttlm(sdata, params); } static int ieee80211_assoc_ml_reconf(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_assoc_link *add_links, u16 rem_links) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); lockdep_assert_wiphy(sdata->local->hw.wiphy); return ieee80211_mgd_assoc_ml_reconf(sdata, add_links, rem_links); } const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, .change_virtual_intf = ieee80211_change_iface, .start_p2p_device = ieee80211_start_p2p_device, .stop_p2p_device = ieee80211_stop_p2p_device, .add_key = ieee80211_add_key, .del_key = ieee80211_del_key, .get_key = ieee80211_get_key, .set_default_key = ieee80211_config_default_key, .set_default_mgmt_key = ieee80211_config_default_mgmt_key, .set_default_beacon_key = ieee80211_config_default_beacon_key, .start_ap = ieee80211_start_ap, .change_beacon = ieee80211_change_beacon, .stop_ap = ieee80211_stop_ap, .add_station = ieee80211_add_station, .del_station = ieee80211_del_station, .change_station = ieee80211_change_station, .get_station = ieee80211_get_station, .dump_station = ieee80211_dump_station, .dump_survey = ieee80211_dump_survey, #ifdef CONFIG_MAC80211_MESH .add_mpath = ieee80211_add_mpath, .del_mpath = ieee80211_del_mpath, .change_mpath = ieee80211_change_mpath, .get_mpath = ieee80211_get_mpath, .dump_mpath = ieee80211_dump_mpath, .get_mpp = ieee80211_get_mpp, .dump_mpp = ieee80211_dump_mpp, .update_mesh_config = ieee80211_update_mesh_config, .get_mesh_config = ieee80211_get_mesh_config, .join_mesh = ieee80211_join_mesh, .leave_mesh = ieee80211_leave_mesh, #endif .join_ocb = ieee80211_join_ocb, .leave_ocb = ieee80211_leave_ocb, .change_bss = ieee80211_change_bss, .inform_bss = ieee80211_inform_bss, .set_txq_params = ieee80211_set_txq_params, .set_monitor_channel = ieee80211_set_monitor_channel, .suspend = ieee80211_suspend, .resume = ieee80211_resume, .scan = ieee80211_scan, .abort_scan = ieee80211_abort_scan, .sched_scan_start = ieee80211_sched_scan_start, .sched_scan_stop = ieee80211_sched_scan_stop, .auth = ieee80211_auth, .assoc = ieee80211_assoc, .deauth = ieee80211_deauth, .disassoc = ieee80211_disassoc, .join_ibss = ieee80211_join_ibss, .leave_ibss = ieee80211_leave_ibss, .set_mcast_rate = ieee80211_set_mcast_rate, .set_wiphy_params = ieee80211_set_wiphy_params, .set_tx_power = ieee80211_set_tx_power, .get_tx_power = ieee80211_get_tx_power, .rfkill_poll = ieee80211_rfkill_poll, CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd) CFG80211_TESTMODE_DUMP(ieee80211_testmode_dump) .set_power_mgmt = ieee80211_set_power_mgmt, .set_bitrate_mask = ieee80211_set_bitrate_mask, .remain_on_channel = ieee80211_remain_on_channel, .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel, .mgmt_tx = ieee80211_mgmt_tx, .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, .set_cqm_rssi_range_config = ieee80211_set_cqm_rssi_range_config, .update_mgmt_frame_registrations = ieee80211_update_mgmt_frame_registrations, .set_antenna = ieee80211_set_antenna, .get_antenna = ieee80211_get_antenna, .set_rekey_data = ieee80211_set_rekey_data, .tdls_oper = ieee80211_tdls_oper, .tdls_mgmt = ieee80211_tdls_mgmt, .tdls_channel_switch = ieee80211_tdls_channel_switch, .tdls_cancel_channel_switch = ieee80211_tdls_cancel_channel_switch, .probe_client = ieee80211_probe_client, .set_noack_map = ieee80211_set_noack_map, #ifdef CONFIG_PM .set_wakeup = ieee80211_set_wakeup, 
#endif .get_channel = ieee80211_cfg_get_channel, .start_radar_detection = ieee80211_start_radar_detection, .end_cac = ieee80211_end_cac, .channel_switch = ieee80211_channel_switch, .set_qos_map = ieee80211_set_qos_map, .set_ap_chanwidth = ieee80211_set_ap_chanwidth, .add_tx_ts = ieee80211_add_tx_ts, .del_tx_ts = ieee80211_del_tx_ts, .start_nan = ieee80211_start_nan, .stop_nan = ieee80211_stop_nan, .nan_change_conf = ieee80211_nan_change_conf, .add_nan_func = ieee80211_add_nan_func, .del_nan_func = ieee80211_del_nan_func, .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, .tx_control_port = ieee80211_tx_control_port, .get_txq_stats = ieee80211_get_txq_stats, .get_ftm_responder_stats = ieee80211_get_ftm_responder_stats, .start_pmsr = ieee80211_start_pmsr, .abort_pmsr = ieee80211_abort_pmsr, .probe_mesh_link = ieee80211_probe_mesh_link, .set_tid_config = ieee80211_set_tid_config, .reset_tid_config = ieee80211_reset_tid_config, .set_sar_specs = ieee80211_set_sar_specs, .color_change = ieee80211_color_change, .set_radar_background = ieee80211_set_radar_background, .add_intf_link = ieee80211_add_intf_link, .del_intf_link = ieee80211_del_intf_link, .add_link_station = ieee80211_add_link_station, .mod_link_station = ieee80211_mod_link_station, .del_link_station = ieee80211_del_link_station, .set_hw_timestamp = ieee80211_set_hw_timestamp, .set_ttlm = ieee80211_set_ttlm, .get_radio_mask = ieee80211_get_radio_mask, .assoc_ml_reconf = ieee80211_assoc_ml_reconf, };
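/*
 * Illustrative sketch, not part of the original file: the two exported
 * color-change helpers above are the driver-facing half of the BSS color
 * flow. Assuming a driver whose firmware reports OBSS color collisions
 * and signals completed color switches, the calls would look roughly
 * like this (the example_* names are hypothetical):
 */
#if 0	/* example only, never built */
static void example_drv_report_collision(struct ieee80211_vif *vif,
					 u64 seen_color_bitmap, u8 link_id)
{
	/* feeds the 500 ms rate-limited collision event shown above */
	ieee80211_obss_color_collision_notify(vif, seen_color_bitmap, link_id);
}

static void example_drv_color_switch_done(struct ieee80211_vif *vif,
					  u8 link_id)
{
	/* queues ieee80211_color_change_finalize_work on the wiphy */
	ieee80211_color_change_finish(vif, link_id);
}
#endif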
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_health.h" /* * Local function prototypes. */ static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp, int first, int last); static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp); static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp, int *entno); static int xfs_dir2_block_sort(const void *a, const void *b); static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; /* * One-time startup routine called from xfs_init().
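* Precomputes the name hashes for "." and ".." so the directory code can
* compare against them without rehashing on every use.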
*/ void xfs_dir_startup(void) { xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1); xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } static xfs_failaddr_t xfs_dir3_block_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_verify_magic(bp, hdr3->magic)) return __this_address; if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; } return __xfs_dir3_data_check(NULL, bp); } static void xfs_dir3_block_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_dir3_block_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void xfs_dir3_block_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; fa = xfs_dir3_block_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } if (!xfs_has_crc(mp)) return; if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .name = "xfs_dir3_block", .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC), cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) }, .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, .verify_struct = xfs_dir3_block_verify, }; xfs_failaddr_t xfs_dir3_block_header_check( struct xfs_buf *bp, xfs_ino_t owner) { struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) return __this_address; if (be64_to_cpu(hdr3->owner) != owner) return __this_address; } return NULL; } int xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_ino_t owner, struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); if (err || !*bpp) return err; /* Check things that we can't do in the verifier. 
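* (The buffer verifier has no inode context, so the owner field in the
* v3 header can only be validated here, after the read.)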
*/ fa = xfs_dir3_block_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } static void xfs_dir3_block_init( struct xfs_da_args *args, struct xfs_buf *bp) { struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; bp->b_ops = &xfs_dir3_block_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); if (xfs_has_crc(mp)) { memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; } hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); } static void xfs_dir2_block_need_space( struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_block_tail *btp, struct xfs_dir2_leaf_entry *blp, __be16 **tagpp, struct xfs_dir2_data_unused **dupp, struct xfs_dir2_data_unused **enddupp, int *compact, int len) { struct xfs_dir2_data_free *bf; __be16 *tagp = NULL; struct xfs_dir2_data_unused *dup = NULL; struct xfs_dir2_data_unused *enddup = NULL; *compact = 0; bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr); /* * If there are stale entries we'll use one for the leaf. */ if (btp->stale) { if (be16_to_cpu(bf[0].length) >= len) { /* * The biggest entry enough to avoid compaction. */ dup = (xfs_dir2_data_unused_t *) ((char *)hdr + be16_to_cpu(bf[0].offset)); goto out; } /* * Will need to compact to make this work. * Tag just before the first leaf entry. */ *compact = 1; tagp = (__be16 *)blp - 1; /* Data object just before the first leaf entry. */ dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free then the data will go where the * leaf data starts now, if it works at all. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) dup = NULL; } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) dup = NULL; else dup = (xfs_dir2_data_unused_t *)blp; goto out; } /* * no stale entries, so just use free space. * Tag just before the first leaf entry. */ tagp = (__be16 *)blp - 1; /* Data object just before the first leaf entry. */ enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free then can't do this add without cleaning up: * the space before the first leaf entry needs to be free so it * can be expanded to hold the pointer to the new entry. */ if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) { /* * Check out the biggest freespace and see if it's the same one. */ dup = (xfs_dir2_data_unused_t *) ((char *)hdr + be16_to_cpu(bf[0].offset)); if (dup != enddup) { /* * Not the same free entry, just check its length. */ if (be16_to_cpu(dup->length) < len) dup = NULL; goto out; } /* * It is the biggest freespace, can it hold the leaf too? */ if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { /* * Yes, use the second-largest entry instead if it works. */ if (be16_to_cpu(bf[1].length) >= len) dup = (xfs_dir2_data_unused_t *) ((char *)hdr + be16_to_cpu(bf[1].offset)); else dup = NULL; } } out: *tagpp = tagp; *dupp = dup; *enddupp = enddup; } /* * compact the leaf entries. * Leave the highest-numbered stale entry stale. 
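* Exactly one stale slot is kept (btp->stale ends up as 1) so the caller
* can drop the new leaf entry into it.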
* XXX should be the one closest to mid but mid is not yet computed. */ static void xfs_dir2_block_compact( struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_block_tail *btp, struct xfs_dir2_leaf_entry *blp, int *needlog, int *lfloghigh, int *lfloglow) { int fromidx; /* source leaf index */ int toidx; /* target leaf index */ int needscan = 0; int highstale; /* high stale index */ fromidx = toidx = be32_to_cpu(btp->count) - 1; highstale = *lfloghigh = -1; for (; fromidx >= 0; fromidx--) { if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { if (highstale == -1) highstale = toidx; else { if (*lfloghigh == -1) *lfloghigh = toidx; continue; } } if (fromidx < toidx) blp[toidx] = blp[fromidx]; toidx--; } *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); *lfloghigh -= be32_to_cpu(btp->stale) - 1; be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), needlog, &needscan); btp->stale = cpu_to_be32(1); /* * If we now need to rebuild the bestfree map, do so. * This needs to happen before the next call to use_free. */ if (needscan) xfs_dir2_data_freescan(args->dp->i_mount, hdr, needlog); } /* * Add an entry to a block directory. */ int /* error */ xfs_dir2_block_addname( xfs_da_args_t *args) /* directory op arguments */ { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ int compact; /* need to compact leaf ents */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* directory inode */ xfs_dir2_data_unused_t *dup; /* block unused entry */ int error; /* error return value */ xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */ xfs_dahash_t hash; /* hash value of found entry */ int high; /* high index for binary srch */ int highstale; /* high stale index */ int lfloghigh=0; /* last final leaf to log */ int lfloglow=0; /* first final leaf to log */ int len; /* length of the new entry */ int low; /* low index for binary srch */ int lowstale; /* low stale index */ int mid=0; /* midpoint for binary srch */ int needlog; /* need to log header */ int needscan; /* need to rescan freespace */ __be16 *tagp; /* pointer to tag value */ xfs_trans_t *tp; /* transaction structure */ trace_xfs_dir2_block_addname(args); dp = args->dp; tp = args->trans; /* Read the (one and only) directory block into bp. */ error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; len = xfs_dir2_data_entsize(dp->i_mount, args->namelen); /* * Set up pointers to parts of the block. */ hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Find out if we can reuse stale entries or whether we need extra * space for entry and new leaf. */ xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup, &enddup, &compact, len); /* * Done everything we need for a space check now. */ if (args->op_flags & XFS_DA_OP_JUSTCHECK) { xfs_trans_brelse(tp, bp); if (!dup) return -ENOSPC; return 0; } /* * If we don't have space for the new entry & leaf ... */ if (!dup) { /* Don't have a space reservation: return no-space. */ if (args->total == 0) return -ENOSPC; /* * Convert to the next larger format. * Then add the new entry in that format. 
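* (Here that means block -> leaf: xfs_dir2_block_to_leaf() followed by
* xfs_dir2_leaf_addname() below.)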
*/ error = xfs_dir2_block_to_leaf(args, bp); if (error) return error; return xfs_dir2_leaf_addname(args); } needlog = needscan = 0; /* * If need to compact the leaf entries, do it now. */ if (compact) { xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog, &lfloghigh, &lfloglow); /* recalculate blp post-compaction */ blp = xfs_dir2_block_leaf_p(btp); } else if (btp->stale) { /* * Set leaf logging boundaries to impossible state. * For the no-stale case they're set explicitly. */ lfloglow = be32_to_cpu(btp->count); lfloghigh = -1; } /* * Find the slot that's first lower than our hash value, -1 if none. */ for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) { mid = (low + high) >> 1; if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval) break; if (hash < args->hashval) low = mid + 1; else high = mid - 1; } while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) { mid--; } /* * No stale entries, will use enddup space to hold new leaf. */ if (!btp->stale) { xfs_dir2_data_aoff_t aoff; /* * Mark the space needed for the new leaf entry, now in use. */ aoff = (xfs_dir2_data_aoff_t)((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) - sizeof(*blp)); error = xfs_dir2_data_use_free(args, bp, enddup, aoff, (xfs_dir2_data_aoff_t)sizeof(*blp), &needlog, &needscan); if (error) return error; /* * Update the tail (entry count). */ be32_add_cpu(&btp->count, 1); /* * If we now need to rebuild the bestfree map, do so. * This needs to happen before the next call to use_free. */ if (needscan) { xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); needscan = 0; } /* * Adjust pointer to the first leaf entry, we're about to move * the table up one to open up space for the new leaf entry. * Then adjust our index to match. */ blp--; mid++; if (mid) memmove(blp, &blp[1], mid * sizeof(*blp)); lfloglow = 0; lfloghigh = mid; } /* * Use a stale leaf for our new entry. */ else { for (lowstale = mid; lowstale >= 0 && blp[lowstale].address != cpu_to_be32(XFS_DIR2_NULL_DATAPTR); lowstale--) continue; for (highstale = mid + 1; highstale < be32_to_cpu(btp->count) && blp[highstale].address != cpu_to_be32(XFS_DIR2_NULL_DATAPTR) && (lowstale < 0 || mid - lowstale > highstale - mid); highstale++) continue; /* * Move entries toward the low-numbered stale entry. */ if (lowstale >= 0 && (highstale == be32_to_cpu(btp->count) || mid - lowstale <= highstale - mid)) { if (mid - lowstale) memmove(&blp[lowstale], &blp[lowstale + 1], (mid - lowstale) * sizeof(*blp)); lfloglow = min(lowstale, lfloglow); lfloghigh = max(mid, lfloghigh); } /* * Move entries toward the high-numbered stale entry. */ else { ASSERT(highstale < be32_to_cpu(btp->count)); mid++; if (highstale - mid) memmove(&blp[mid + 1], &blp[mid], (highstale - mid) * sizeof(*blp)); lfloglow = min(mid, lfloglow); lfloghigh = max(highstale, lfloghigh); } be32_add_cpu(&btp->stale, -1); } /* * Point to the new data entry. */ dep = (xfs_dir2_data_entry_t *)dup; /* * Fill in the leaf entry. */ blp[mid].hashval = cpu_to_be32(args->hashval); blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); /* * Mark space for the data entry used. */ error = xfs_dir2_data_use_free(args, bp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), (xfs_dir2_data_aoff_t)len, &needlog, &needscan); if (error) return error; /* * Create the new data entry. 
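* Fill in the inode number, name and file type, then point the tag at
* the entry's offset so the block can be walked backwards.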
*/ dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype); tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Clean up the bestfree array and log the header, tail, and entry. */ if (needscan) xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(args, bp); xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_log_entry(args, bp, dep); xfs_dir3_data_check(dp, bp); return 0; } /* * Log leaf entries from the block. */ static void xfs_dir2_block_log_leaf( xfs_trans_t *tp, /* transaction structure */ struct xfs_buf *bp, /* block buffer */ int first, /* index of first logged leaf */ int last) /* index of last logged leaf */ { xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_leaf_entry_t *blp; xfs_dir2_block_tail_t *btp; btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); blp = xfs_dir2_block_leaf_p(btp); xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); } /* * Log the block tail. */ static void xfs_dir2_block_log_tail( xfs_trans_t *tp, /* transaction structure */ struct xfs_buf *bp) /* block buffer */ { xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_block_tail_t *btp; btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), (uint)((char *)(btp + 1) - (char *)hdr - 1)); } /* * Look up an entry in the block. This is the external routine, * xfs_dir2_block_lookup_int does the real work. */ int /* error */ xfs_dir2_block_lookup( xfs_da_args_t *args) /* dir lookup arguments */ { xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ int ent; /* entry index */ int error; /* error return value */ trace_xfs_dir2_block_lookup(args); /* * Get the buffer, look up the entry. * If not found (ENOENT) then return, have no buffer. */ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) return error; dp = args->dp; hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Get the offset from the leaf entry, to point to the data. */ dep = (xfs_dir2_data_entry_t *)((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(blp[ent].address))); /* * Fill in inode number, CI name if appropriate, release the block. */ args->inumber = be64_to_cpu(dep->inumber); args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(args->trans, bp); return error; } /* * Internal block lookup routine. 
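* Binary-search the leaf entries by hash value, back up to the first
* entry with that hash, then scan forward comparing names (tracking
* case-insensitive matches via args->cmpresult).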
*/ static int /* error */ xfs_dir2_block_lookup_int( xfs_da_args_t *args, /* dir lookup arguments */ struct xfs_buf **bpp, /* returned block buffer */ int *entno) /* returned entry number */ { xfs_dir2_dataptr_t addr; /* data entry address */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ int error; /* error return value */ xfs_dahash_t hash; /* found hash value */ int high; /* binary search high index */ int low; /* binary search low index */ int mid; /* binary search current idx */ xfs_trans_t *tp; /* transaction pointer */ enum xfs_dacmp cmp; /* comparison result */ dp = args->dp; tp = args->trans; error = xfs_dir3_block_read(tp, dp, args->owner, &bp); if (error) return error; hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Loop doing a binary search for our hash value. * Find our entry, ENOENT if it's not there. */ for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) { ASSERT(low <= high); mid = (low + high) >> 1; if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval) break; if (hash < args->hashval) low = mid + 1; else high = mid - 1; if (low > high) { ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); xfs_trans_brelse(tp, bp); return -ENOENT; } } /* * Back up to the first one with the right hash value. */ while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) { mid--; } /* * Now loop forward through all the entries with the * right hash value looking for our name. */ do { if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR) continue; /* * Get pointer to the entry from the leaf. */ dep = (xfs_dir2_data_entry_t *) ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr)); /* * Compare name and if it's an exact match, return the index * and buffer. If it's the first case-insensitive match, store * the index and buffer and continue looking for an exact match. */ cmp = xfs_dir2_compname(args, dep->name, dep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; *bpp = bp; *entno = mid; if (cmp == XFS_CMP_EXACT) return 0; } } while (++mid < be32_to_cpu(btp->count) && be32_to_cpu(blp[mid].hashval) == hash); ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); /* * Here, we can only be doing a lookup (not a rename or replace). * If a case-insensitive match was found earlier, return success. */ if (args->cmpresult == XFS_CMP_CASE) return 0; /* * No match, release the buffer and return ENOENT. */ xfs_trans_brelse(tp, bp); return -ENOENT; } /* * Remove an entry from a block format directory. * If that makes the block small enough to fit in shortform, transform it. 
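* The data space is returned to the bestfree pool and the leaf entry is
* only marked stale, not compacted away.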
/*
 * Remove an entry from a block format directory.
 * If that makes the block small enough to fit in shortform, transform it.
 */
int						/* error */
xfs_dir2_block_removename(
	xfs_da_args_t		*args)		/* directory operation args */
{
	xfs_dir2_data_hdr_t	*hdr;		/* block header */
	xfs_dir2_leaf_entry_t	*blp;		/* block leaf pointer */
	struct xfs_buf		*bp;		/* block buffer */
	xfs_dir2_block_tail_t	*btp;		/* block tail */
	xfs_dir2_data_entry_t	*dep;		/* block data entry */
	xfs_inode_t		*dp;		/* incore inode */
	int			ent;		/* block leaf entry index */
	int			error;		/* error return value */
	int			needlog;	/* need to log block header */
	int			needscan;	/* need to fixup bestfree */
	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
	int			size;		/* shortform size */
	xfs_trans_t		*tp;		/* transaction pointer */

	trace_xfs_dir2_block_removename(args);

	/*
	 * Look up the entry in the block.  Gets the buffer and entry index.
	 * It will always be there, the vnodeops level does a lookup first.
	 */
	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
		return error;
	}
	dp = args->dp;
	tp = args->trans;
	hdr = bp->b_addr;
	btp = xfs_dir2_block_tail_p(args->geo, hdr);
	blp = xfs_dir2_block_leaf_p(btp);
	/*
	 * Point to the data entry using the leaf entry.
	 */
	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
		xfs_dir2_dataptr_to_off(args->geo,
					be32_to_cpu(blp[ent].address)));
	/*
	 * Mark the data entry's space free.
	 */
	needlog = needscan = 0;
	xfs_dir2_data_make_free(args, bp,
		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
		xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
		&needscan);
	/*
	 * Fix up the block tail.
	 */
	be32_add_cpu(&btp->stale, 1);
	xfs_dir2_block_log_tail(tp, bp);
	/*
	 * Remove the leaf entry by marking it stale.
	 */
	blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
	xfs_dir2_block_log_leaf(tp, bp, ent, ent);
	/*
	 * Fix up bestfree, log the header if necessary.
	 */
	if (needscan)
		xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
	if (needlog)
		xfs_dir2_data_log_header(args, bp);
	xfs_dir3_data_check(dp, bp);
	/*
	 * See if the size as a shortform is good enough.
	 */
	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
	if (size > xfs_inode_data_fork_size(dp))
		return 0;
	/*
	 * If it works, do the conversion.
	 */
	return xfs_dir2_block_to_sf(args, bp, size, &sfh);
}

/*
 * Replace an entry in a V2 block directory.
 * Change the inode number to the new value.
 */
int						/* error */
xfs_dir2_block_replace(
	xfs_da_args_t		*args)		/* directory operation args */
{
	xfs_dir2_data_hdr_t	*hdr;		/* block header */
	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
	struct xfs_buf		*bp;		/* block buffer */
	xfs_dir2_block_tail_t	*btp;		/* block tail */
	xfs_dir2_data_entry_t	*dep;		/* block data entry */
	xfs_inode_t		*dp;		/* incore inode */
	int			ent;		/* leaf entry index */
	int			error;		/* error return value */

	trace_xfs_dir2_block_replace(args);

	/*
	 * Lookup the entry in the directory.  Get buffer and entry index.
	 * This will always succeed since the caller has already done a lookup.
	 */
	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
		return error;
	}
	dp = args->dp;
	hdr = bp->b_addr;
	btp = xfs_dir2_block_tail_p(args->geo, hdr);
	blp = xfs_dir2_block_leaf_p(btp);
	/*
	 * Point to the data entry we need to change.
	 */
	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
		xfs_dir2_dataptr_to_off(args->geo,
					be32_to_cpu(blp[ent].address)));
	ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
	/*
	 * Change the inode number to the new value.
	 */
	dep->inumber = cpu_to_be64(args->inumber);
	xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
	xfs_dir2_data_log_entry(args, bp, dep);
	xfs_dir3_data_check(dp, bp);
	return 0;
}
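/*
 * Aside: the routines above all compute a tag pointer at the end of each
 * data entry.  A hedged sketch of the entry shape they manipulate (the
 * authoritative layout lives in xfs_da_format.h): the 2-byte tag at the
 * very end of an entry holds the entry's own offset from the start of the
 * block, which is what lets xfs_dir2_leaf_to_block() below find the last
 * entry from the block tail via
 * tagp = (__be16 *)((char *)hdr + blksize) - 1.
 *
 *  +----------+---------+-----------+---------+-----+-------+
 *  | inumber  | namelen | name[]    | (ftype) | pad | tag   |
 *  | 8 bytes  | 1 byte  | namelen B | 1 byte  |     | 2 B   |
 *  +----------+---------+-----------+---------+-----+-------+
 *
 * Entries are rounded up to an 8-byte multiple; a standalone sketch of the
 * size calculation, assuming the ftype byte is present:
 */
#include <stdint.h>

#define DIR2_DATA_ALIGN	8	/* illustrative stand-in constant */

static inline unsigned int entsize(unsigned int namelen)
{
	unsigned int raw = 8 /* inumber */ + 1 /* namelen */ + namelen +
			   1 /* ftype */ + 2 /* tag */;

	return (raw + DIR2_DATA_ALIGN - 1) & ~(DIR2_DATA_ALIGN - 1);
}
/* entsize(1) == 16 for ".", entsize(2) == 16 for "..". */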
/*
 * Qsort comparison routine for the block leaf entries.
 */
static int					/* sort order */
xfs_dir2_block_sort(
	const void			*a,	/* first leaf entry */
	const void			*b)	/* second leaf entry */
{
	const xfs_dir2_leaf_entry_t	*la;	/* first leaf entry */
	const xfs_dir2_leaf_entry_t	*lb;	/* second leaf entry */

	la = a;
	lb = b;
	return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
		(be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
}

/*
 * Convert a V2 leaf directory to a V2 block directory if possible.
 */
int						/* error */
xfs_dir2_leaf_to_block(
	xfs_da_args_t		*args,		/* operation arguments */
	struct xfs_buf		*lbp,		/* leaf buffer */
	struct xfs_buf		*dbp)		/* data buffer */
{
	__be16			*bestsp;	/* leaf bests table */
	xfs_dir2_data_hdr_t	*hdr;		/* block header */
	xfs_dir2_block_tail_t	*btp;		/* block tail */
	xfs_inode_t		*dp;		/* incore directory inode */
	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
	int			error;		/* error return value */
	int			from;		/* leaf from index */
	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
	xfs_mount_t		*mp;		/* file system mount point */
	int			needlog;	/* need to log data header */
	int			needscan;	/* need to scan for bestfree */
	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
	int			size;		/* bytes used */
	__be16			*tagp;		/* end of entry (tag) */
	int			to;		/* block/leaf to index */
	xfs_trans_t		*tp;		/* transaction pointer */
	struct xfs_dir3_icleaf_hdr leafhdr;

	trace_xfs_dir2_leaf_to_block(args);

	dp = args->dp;
	tp = args->trans;
	mp = dp->i_mount;
	leaf = lbp->b_addr;
	xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);

	ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
	       leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
	/*
	 * If there are data blocks other than the first one, take this
	 * opportunity to remove trailing empty data blocks that may have
	 * been left behind during no-space-reservation operations.
	 * These will show up in the leaf bests table.
	 */
	while (dp->i_disk_size > args->geo->blksize) {
		int hdrsz;

		hdrsz = args->geo->data_entry_offset;
		bestsp = xfs_dir2_leaf_bests_p(ltp);
		if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
					    args->geo->blksize - hdrsz) {
			if ((error =
			    xfs_dir2_leaf_trim_data(args, lbp,
				    (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
				return error;
		} else
			return 0;
	}
	/*
	 * Read the data block if we don't already have it, give up if it fails.
	 */
	if (!dbp) {
		error = xfs_dir3_data_read(tp, dp, args->owner,
				args->geo->datablk, 0, &dbp);
		if (error)
			return error;
	}
	hdr = dbp->b_addr;
	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));

	/*
	 * Size of the "leaf" area in the block.
	 */
	size = (uint)sizeof(xfs_dir2_block_tail_t) +
	       (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);

	/*
	 * Look at the last data entry.
	 */
	tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
	dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
	/*
	 * If it's not free or is too short we can't do it.
	 */
	if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
	    be16_to_cpu(dup->length) < size)
		return 0;

	/*
	 * Start converting it to block form.
	 */
	xfs_dir3_block_init(args, dbp);

	needlog = 1;
	needscan = 0;
	/*
	 * Use up the space at the end of the block (blp/btp).
	 */
	error = xfs_dir2_data_use_free(args, dbp, dup,
			args->geo->blksize - size, size, &needlog, &needscan);
	if (error)
		return error;

	/*
	 * Initialize the block tail.
	 */
	btp = xfs_dir2_block_tail_p(args->geo, hdr);
	btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
	btp->stale = 0;
	xfs_dir2_block_log_tail(tp, dbp);
	/*
	 * Initialize the block leaf area.  We compact out stale entries.
	 */
	lep = xfs_dir2_block_leaf_p(btp);
	for (from = to = 0; from < leafhdr.count; from++) {
		if (leafhdr.ents[from].address ==
		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
			continue;
		lep[to++] = leafhdr.ents[from];
	}
	ASSERT(to == be32_to_cpu(btp->count));
	xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
	/*
	 * Scan the bestfree if we need it and log the data block header.
	 */
	if (needscan)
		xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
	if (needlog)
		xfs_dir2_data_log_header(args, dbp);
	/*
	 * Pitch the old leaf block.
	 */
	error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
	if (error)
		return error;

	/*
	 * Now see if the resulting block can be shrunken to shortform.
	 */
	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
	if (size > xfs_inode_data_fork_size(dp))
		return 0;

	return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
}

/*
 * Convert the shortform directory to block form.
 */
int						/* error */
xfs_dir2_sf_to_block(
	struct xfs_da_args	*args)
{
	struct xfs_trans	*tp = args->trans;
	struct xfs_inode	*dp = args->dp;
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
	struct xfs_da_geometry	*geo = args->geo;
	xfs_dir2_db_t		blkno;		/* dir-relative block # (0) */
	xfs_dir2_data_hdr_t	*hdr;		/* block header */
	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
	struct xfs_buf		*bp;		/* block buffer */
	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
	int			dummy;		/* trash */
	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
	int			endoffset;	/* end of data objects */
	int			error;		/* error return value */
	int			i;		/* index */
	int			needlog;	/* need to log block header */
	int			needscan;	/* need to scan block freespc */
	int			newoffset;	/* offset from current entry */
	unsigned int		offset = geo->data_entry_offset;
	xfs_dir2_sf_entry_t	*sfep;		/* sf entry pointer */
	struct xfs_dir2_sf_hdr	*oldsfp = ifp->if_data;
	xfs_dir2_sf_hdr_t	*sfp;		/* shortform header */
	__be16			*tagp;		/* end of data entry */
	struct xfs_name		name;

	trace_xfs_dir2_sf_to_block(args);

	ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
	ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
	ASSERT(ifp->if_bytes == dp->i_disk_size);
	ASSERT(oldsfp != NULL);
	ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
	ASSERT(dp->i_df.if_nextents == 0);

	/*
	 * Copy the directory into a temporary buffer.
	 * Then pitch the incore inode data so we can make extents.
	 */
	sfp = kmalloc(ifp->if_bytes, GFP_KERNEL | __GFP_NOFAIL);
	memcpy(sfp, oldsfp, ifp->if_bytes);

	xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
	xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK);
	dp->i_disk_size = 0;

	/*
	 * Add block 0 to the inode.
	 */
	error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
	if (error)
		goto out_free;
	/*
	 * Initialize the data block, then convert it to block format.
	 */
	error = xfs_dir3_data_init(args, blkno, &bp);
	if (error)
		goto out_free;
	xfs_dir3_block_init(args, bp);
	hdr = bp->b_addr;

	/*
	 * Compute size of block "tail" area.
	 */
	i = (uint)sizeof(*btp) +
	    (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);

	/*
	 * The whole thing is initialized to free by the init routine.
	 * Say we're using the leaf and tail area.
	 */
	dup = bp->b_addr + offset;
	needlog = needscan = 0;
	error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
			i, &needlog, &needscan);
	if (error)
		goto out_free;
	ASSERT(needscan == 0);
	/*
	 * Fill in the tail.
	 */
	btp = xfs_dir2_block_tail_p(args->geo, hdr);
	btp->count = cpu_to_be32(sfp->count + 2);	/* ., .. */
	btp->stale = 0;
	blp = xfs_dir2_block_leaf_p(btp);
	endoffset = (uint)((char *)blp - (char *)hdr);
	/*
	 * Remove the freespace, we'll manage it.
	 */
	error = xfs_dir2_data_use_free(args, bp, dup,
			(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
			be16_to_cpu(dup->length), &needlog, &needscan);
	if (error)
		goto out_free;

	/*
	 * Create entry for .
	 */
	dep = bp->b_addr + offset;
	dep->inumber = cpu_to_be64(args->owner);
	dep->namelen = 1;
	dep->name[0] = '.';
	xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
	tagp = xfs_dir2_data_entry_tag_p(mp, dep);
	*tagp = cpu_to_be16(offset);
	xfs_dir2_data_log_entry(args, bp, dep);
	blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
	blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset));
	offset += xfs_dir2_data_entsize(mp, dep->namelen);

	/*
	 * Create entry for ..
	 */
	dep = bp->b_addr + offset;
	dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
	dep->namelen = 2;
	dep->name[0] = dep->name[1] = '.';
	xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
	tagp = xfs_dir2_data_entry_tag_p(mp, dep);
	*tagp = cpu_to_be16(offset);
	xfs_dir2_data_log_entry(args, bp, dep);
	blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
	blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset));
	offset += xfs_dir2_data_entsize(mp, dep->namelen);

	/*
	 * Loop over existing entries, stuff them in.
	 */
	i = 0;
	if (!sfp->count)
		sfep = NULL;
	else
		sfep = xfs_dir2_sf_firstentry(sfp);

	/*
	 * Need to preserve the existing offset values in the sf directory.
	 * Insert holes (unused entries) where necessary.
	 */
	while (offset < endoffset) {
		/*
		 * sfep is null when we reach the end of the list.
		 */
		if (sfep == NULL)
			newoffset = endoffset;
		else
			newoffset = xfs_dir2_sf_get_offset(sfep);
		/*
		 * There should be a hole here, make one.
		 */
		if (offset < newoffset) {
			dup = bp->b_addr + offset;
			dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
			dup->length = cpu_to_be16(newoffset - offset);
			*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(offset);
			xfs_dir2_data_log_unused(args, bp, dup);
			xfs_dir2_data_freeinsert(hdr,
					xfs_dir2_data_bestfree_p(mp, hdr),
					dup, &dummy);
			offset += be16_to_cpu(dup->length);
			continue;
		}
		/*
		 * Copy a real entry.
		 */
		dep = bp->b_addr + newoffset;
		dep->inumber = cpu_to_be64(xfs_dir2_sf_get_ino(mp, sfp, sfep));
		dep->namelen = sfep->namelen;
		xfs_dir2_data_put_ftype(mp, dep,
				xfs_dir2_sf_get_ftype(mp, sfep));
		memcpy(dep->name, sfep->name, dep->namelen);
		tagp = xfs_dir2_data_entry_tag_p(mp, dep);
		*tagp = cpu_to_be16(newoffset);
		xfs_dir2_data_log_entry(args, bp, dep);
		name.name = sfep->name;
		name.len = sfep->namelen;
		blp[2 + i].hashval = cpu_to_be32(xfs_dir2_hashname(mp, &name));
		blp[2 + i].address =
			cpu_to_be32(xfs_dir2_byte_to_dataptr(newoffset));
		offset = (int)((char *)(tagp + 1) - (char *)hdr);
		if (++i == sfp->count)
			sfep = NULL;
		else
			sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
	}

	/* Done with the temporary buffer */
	kfree(sfp);
	/*
	 * Sort the leaf entries by hash value.
	 */
	xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
	/*
	 * Log the leaf entry area and tail.
	 * Already logged the header in data_init, ignore needlog.
	 */
	ASSERT(needscan == 0);
	xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
	xfs_dir2_block_log_tail(tp, bp);
	xfs_dir3_data_check(dp, bp);
	return 0;
out_free:
	kfree(sfp);
	return error;
}
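/*
 * Aside: xfs_dir2_sf_to_block() above hands the freshly built leaf array to
 * xfs_sort() with the xfs_dir2_block_sort() comparator.  A minimal
 * userspace analogue of that step using qsort(3); the types and the be32
 * conversion are local stand-ins for the kernel ones.
 */
#include <stdint.h>
#include <stdlib.h>
#include <arpa/inet.h>		/* ntohl() as a be32-to-cpu stand-in */

struct leaf_ent {
	uint32_t hashval;	/* big-endian on disk */
	uint32_t address;	/* big-endian on disk */
};

static int leaf_cmp(const void *a, const void *b)
{
	uint32_t ha = ntohl(((const struct leaf_ent *)a)->hashval);
	uint32_t hb = ntohl(((const struct leaf_ent *)b)->hashval);

	/* Three-way compare; subtraction could overflow for 32-bit keys. */
	return ha < hb ? -1 : (ha > hb ? 1 : 0);
}

static void sort_leaf(struct leaf_ent *blp, size_t count)
{
	qsort(blp, count, sizeof(*blp), leaf_cmp);
}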
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
 * Copyright 2005-2006 Ian Kent <raven@themaw.net>
 */

#include <linux/seq_file.h>
#include <linux/pagemap.h>

#include "autofs_i.h"

struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi)
{
	struct autofs_info *ino;

	ino = kzalloc(sizeof(*ino), GFP_KERNEL);
	if (ino) {
		INIT_LIST_HEAD(&ino->active);
		INIT_LIST_HEAD(&ino->expiring);
		ino->last_used = jiffies;
		ino->sbi = sbi;
		ino->exp_timeout = -1;
		ino->count = 1;
	}
	return ino;
}

void autofs_clean_ino(struct autofs_info *ino)
{
	ino->uid = GLOBAL_ROOT_UID;
	ino->gid = GLOBAL_ROOT_GID;
	ino->exp_timeout = -1;
	ino->last_used = jiffies;
}

void autofs_free_ino(struct autofs_info *ino)
{
	kfree_rcu(ino, rcu);
}
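/*
 * Aside: a hedged sketch of the deferred-free pattern that
 * autofs_free_ino() above relies on.  The object embeds a struct rcu_head,
 * and kfree_rcu() frees it only after an RCU grace period, so lockless
 * readers still holding a pointer never see freed memory.  The structure
 * here is illustrative; the real autofs_info lives in autofs_i.h.
 */
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct example_obj {
	int		data;
	struct rcu_head	rcu;	/* storage for the deferred callback */
};

static void example_free(struct example_obj *obj)
{
	/* Equivalent to call_rcu() with a callback that kfree()s obj. */
	kfree_rcu(obj, rcu);
}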
void autofs_kill_sb(struct super_block *sb)
{
	struct autofs_sb_info *sbi = autofs_sbi(sb);

	/*
	 * In the event of a failure in get_sb_nodev the superblock
	 * info is not present so nothing else has been setup, so
	 * just call kill_anon_super when we are called from
	 * deactivate_super.
	 */
	if (sbi) {
		/* Free wait queues, close pipe */
		autofs_catatonic_mode(sbi);
		put_pid(sbi->oz_pgrp);
	}

	pr_debug("shutting down\n");
	kill_litter_super(sb);
	if (sbi)
		kfree_rcu(sbi, rcu);
}

static int autofs_show_options(struct seq_file *m, struct dentry *root)
{
	struct autofs_sb_info *sbi = autofs_sbi(root->d_sb);
	struct inode *root_inode = d_inode(root->d_sb->s_root);

	if (!sbi)
		return 0;

	seq_printf(m, ",fd=%d", sbi->pipefd);
	if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
		seq_printf(m, ",uid=%u",
			from_kuid_munged(&init_user_ns, root_inode->i_uid));
	if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
		seq_printf(m, ",gid=%u",
			from_kgid_munged(&init_user_ns, root_inode->i_gid));
	seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp));
	seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
	seq_printf(m, ",minproto=%d", sbi->min_proto);
	seq_printf(m, ",maxproto=%d", sbi->max_proto);

	if (autofs_type_offset(sbi->type))
		seq_puts(m, ",offset");
	else if (autofs_type_direct(sbi->type))
		seq_puts(m, ",direct");
	else
		seq_puts(m, ",indirect");
	if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
		seq_puts(m, ",strictexpire");
	if (sbi->flags & AUTOFS_SBI_IGNORE)
		seq_puts(m, ",ignore");
#ifdef CONFIG_CHECKPOINT_RESTORE
	if (sbi->pipe)
		seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
	else
		seq_puts(m, ",pipe_ino=-1");
#endif
	return 0;
}

static void autofs_evict_inode(struct inode *inode)
{
	clear_inode(inode);
	kfree(inode->i_private);
}

static const struct super_operations autofs_sops = {
	.statfs		= simple_statfs,
	.show_options	= autofs_show_options,
	.evict_inode	= autofs_evict_inode,
};

enum {
	Opt_direct,
	Opt_fd,
	Opt_gid,
	Opt_ignore,
	Opt_indirect,
	Opt_maxproto,
	Opt_minproto,
	Opt_offset,
	Opt_pgrp,
	Opt_strictexpire,
	Opt_uid,
};

const struct fs_parameter_spec autofs_param_specs[] = {
	fsparam_flag	("direct",		Opt_direct),
	fsparam_fd	("fd",			Opt_fd),
	fsparam_gid	("gid",			Opt_gid),
	fsparam_flag	("ignore",		Opt_ignore),
	fsparam_flag	("indirect",		Opt_indirect),
	fsparam_u32	("maxproto",		Opt_maxproto),
	fsparam_u32	("minproto",		Opt_minproto),
	fsparam_flag	("offset",		Opt_offset),
	fsparam_u32	("pgrp",		Opt_pgrp),
	fsparam_flag	("strictexpire",	Opt_strictexpire),
	fsparam_uid	("uid",			Opt_uid),
	{}
};

struct autofs_fs_context {
	kuid_t	uid;
	kgid_t	gid;
	int	pgrp;
	bool	pgrp_set;
};

/*
 * Open the fd.  We do it here rather than in get_tree so that it's done in
 * the context of the system call that passed the data and not the one that
 * triggered the superblock creation, lest the fd gets reassigned.
 */
static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi,
			   struct fs_parameter *param,
			   struct fs_parse_result *result)
{
	struct file *pipe;
	int ret;

	if (param->type == fs_value_is_file) {
		/* came through the new api */
		pipe = param->file;
		param->file = NULL;
	} else {
		pipe = fget(result->uint_32);
	}
	if (!pipe) {
		errorf(fc, "could not open pipe file descriptor");
		return -EBADF;
	}

	ret = autofs_check_pipe(pipe);
	if (ret < 0) {
		errorf(fc, "Invalid/unusable pipe");
		fput(pipe);
		return -EBADF;
	}

	autofs_set_packet_pipe_flags(pipe);

	if (sbi->pipe)
		fput(sbi->pipe);
	sbi->pipefd = result->uint_32;
	sbi->pipe = pipe;

	return 0;
}

static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct autofs_fs_context *ctx = fc->fs_private;
	struct autofs_sb_info *sbi = fc->s_fs_info;
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, autofs_param_specs, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_fd:
		return autofs_parse_fd(fc, sbi, param, &result);
	case Opt_uid:
		ctx->uid = result.uid;
		break;
	case Opt_gid:
		ctx->gid = result.gid;
		break;
	case Opt_pgrp:
		ctx->pgrp = result.uint_32;
		ctx->pgrp_set = true;
		break;
	case Opt_minproto:
		sbi->min_proto = result.uint_32;
		break;
	case Opt_maxproto:
		sbi->max_proto = result.uint_32;
		break;
	case Opt_indirect:
		set_autofs_type_indirect(&sbi->type);
		break;
	case Opt_direct:
		set_autofs_type_direct(&sbi->type);
		break;
	case Opt_offset:
		set_autofs_type_offset(&sbi->type);
		break;
	case Opt_strictexpire:
		sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
		break;
	case Opt_ignore:
		sbi->flags |= AUTOFS_SBI_IGNORE;
	}

	return 0;
}

static struct autofs_sb_info *autofs_alloc_sbi(void)
{
	struct autofs_sb_info *sbi;

	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	if (!sbi)
		return NULL;

	sbi->magic = AUTOFS_SBI_MAGIC;
	sbi->flags = AUTOFS_SBI_CATATONIC;
	sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
	sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
	sbi->pipefd = -1;

	set_autofs_type_indirect(&sbi->type);
	mutex_init(&sbi->wq_mutex);
	mutex_init(&sbi->pipe_mutex);
	spin_lock_init(&sbi->fs_lock);
	spin_lock_init(&sbi->lookup_lock);
	INIT_LIST_HEAD(&sbi->active_list);
	INIT_LIST_HEAD(&sbi->expiring_list);

	return sbi;
}

static int autofs_validate_protocol(struct fs_context *fc)
{
	struct autofs_sb_info *sbi = fc->s_fs_info;

	/* Test versions first */
	if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
	    sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
		errorf(fc, "kernel does not match daemon version "
			   "daemon (%d, %d) kernel (%d, %d)\n",
			   sbi->min_proto, sbi->max_proto,
			   AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
		return -EINVAL;
	}

	/* Establish highest kernel protocol version */
	if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION)
		sbi->version = AUTOFS_MAX_PROTO_VERSION;
	else
		sbi->version = sbi->max_proto;

	switch (sbi->version) {
	case 4:
		sbi->sub_version = 7;
		break;
	case 5:
		sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
		break;
	default:
		sbi->sub_version = 0;
	}

	return 0;
}

static int autofs_fill_super(struct super_block *s, struct fs_context *fc)
{
	struct autofs_fs_context *ctx = fc->fs_private;
	struct autofs_sb_info *sbi = s->s_fs_info;
	struct inode *root_inode;
	struct autofs_info *ino;

	pr_debug("starting up, sbi = %p\n", sbi);

	sbi->sb = s;
	s->s_blocksize = 1024;
	s->s_blocksize_bits = 10;
	s->s_magic = AUTOFS_SUPER_MAGIC;
	s->s_op = &autofs_sops;
	s->s_d_op = &autofs_dentry_operations;
	s->s_time_gran = 1;

	/*
	 * Get the root inode and dentry, but defer checking for errors.
	 */
	ino = autofs_new_ino(sbi);
	if (!ino)
		return -ENOMEM;
	root_inode = autofs_get_inode(s, S_IFDIR | 0755);
	if (!root_inode)
		return -ENOMEM;
	root_inode->i_uid = ctx->uid;
	root_inode->i_gid = ctx->gid;
	root_inode->i_fop = &autofs_root_operations;
	root_inode->i_op = &autofs_dir_inode_operations;
	s->s_root = d_make_root(root_inode);
	if (unlikely(!s->s_root)) {
		autofs_free_ino(ino);
		return -ENOMEM;
	}
	s->s_root->d_fsdata = ino;

	if (ctx->pgrp_set) {
		sbi->oz_pgrp = find_get_pid(ctx->pgrp);
		if (!sbi->oz_pgrp)
			return invalf(fc, "Could not find process group %d",
				      ctx->pgrp);
	} else
		sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);

	if (autofs_type_trigger(sbi->type))
		/* s->s_root won't be contended so there's little to
		 * be gained by not taking the d_lock when setting
		 * d_flags, even when a lot of mounts are being done.
		 */
		managed_dentry_set_managed(s->s_root);

	pr_debug("pipe fd = %d, pgrp = %u\n",
		 sbi->pipefd, pid_nr(sbi->oz_pgrp));

	sbi->flags &= ~AUTOFS_SBI_CATATONIC;
	return 0;
}

/*
 * Validate the parameters and then request a superblock.
 */
static int autofs_get_tree(struct fs_context *fc)
{
	struct autofs_sb_info *sbi = fc->s_fs_info;
	int ret;

	ret = autofs_validate_protocol(fc);
	if (ret)
		return ret;

	if (sbi->pipefd < 0)
		return invalf(fc, "No control pipe specified");

	return get_tree_nodev(fc, autofs_fill_super);
}

static void autofs_free_fc(struct fs_context *fc)
{
	struct autofs_fs_context *ctx = fc->fs_private;
	struct autofs_sb_info *sbi = fc->s_fs_info;

	if (sbi) {
		if (sbi->pipe)
			fput(sbi->pipe);
		kfree(sbi);
	}
	kfree(ctx);
}

static const struct fs_context_operations autofs_context_ops = {
	.free		= autofs_free_fc,
	.parse_param	= autofs_parse_param,
	.get_tree	= autofs_get_tree,
};

/*
 * Set up the filesystem mount context.
 */
int autofs_init_fs_context(struct fs_context *fc)
{
	struct autofs_fs_context *ctx;
	struct autofs_sb_info *sbi;

	ctx = kzalloc(sizeof(struct autofs_fs_context), GFP_KERNEL);
	if (!ctx)
		goto nomem;

	ctx->uid = current_uid();
	ctx->gid = current_gid();

	sbi = autofs_alloc_sbi();
	if (!sbi)
		goto nomem_ctx;

	fc->fs_private = ctx;
	fc->s_fs_info = sbi;
	fc->ops = &autofs_context_ops;

	return 0;

nomem_ctx:
	kfree(ctx);
nomem:
	return -ENOMEM;
}

struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
{
	struct inode *inode = new_inode(sb);

	if (inode == NULL)
		return NULL;

	inode->i_mode = mode;
	if (sb->s_root) {
		inode->i_uid = d_inode(sb->s_root)->i_uid;
		inode->i_gid = d_inode(sb->s_root)->i_gid;
	}
	simple_inode_init_ts(inode);
	inode->i_ino = get_next_ino();

	if (S_ISDIR(mode)) {
		set_nlink(inode, 2);
		inode->i_op = &autofs_dir_inode_operations;
		inode->i_fop = &autofs_dir_operations;
	} else if (S_ISLNK(mode)) {
		inode->i_op = &autofs_symlink_inode_operations;
	} else
		WARN_ON(1);

	return inode;
}
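/*
 * Aside: a hedged userspace sketch of driving the fs_context path above
 * with the new mount API (Linux 5.2+).  Raw syscall numbers are used since
 * libc wrappers may be absent, and error handling is elided.  The parameter
 * names match autofs_param_specs; the pipe fd, pgrp, and mountpoint values
 * are assumptions supplied by the caller.
 */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <linux/mount.h>

static int mount_autofs(int pipefd, int pgrp, const char *mntpt)
{
	char pgrpbuf[16];
	int fsfd, mfd;

	fsfd = syscall(SYS_fsopen, "autofs", 0);
	/* Each fsconfig() call lands in autofs_parse_param(). */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FD, "fd", NULL, pipefd);
	snprintf(pgrpbuf, sizeof(pgrpbuf), "%d", pgrp);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "pgrp", pgrpbuf, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "minproto", "5", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "maxproto", "5", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "indirect", NULL, 0);
	/* CMD_CREATE triggers autofs_get_tree() -> autofs_fill_super(). */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mfd = syscall(SYS_fsmount, fsfd, 0, 0);
	syscall(SYS_move_mount, mfd, "", AT_FDCWD, mntpt,
		MOVE_MOUNT_F_EMPTY_PATH);
	close(mfd);
	close(fsfd);
	return 0;
}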
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_dir2.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_rtbitmap.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_buf_item.h"
#include "xfs_trace.h"
#include "xfs_attr_leaf.h"
#include "xfs_filestream.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_health.h"
#include "xfs_bmap_item.h"
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
#include "xfs_rtgroup.h"

struct kmem_cache		*xfs_bmap_intent_cache;

/*
 * Miscellaneous helper functions
 */

/*
 * Compute and fill in the value of the maximum depth of a bmap btree
 * in this filesystem.  Done once, during mount.
 */
void
xfs_bmap_compute_maxlevels(
	xfs_mount_t	*mp,		/* file system mount structure */
	int		whichfork)	/* data or attr fork */
{
	uint64_t	maxblocks;	/* max blocks at this level */
	xfs_extnum_t	maxleafents;	/* max leaf entries possible */
	int		level;		/* btree level */
	int		maxrootrecs;	/* max records in root block */
	int		minleafrecs;	/* min records in leaf block */
	int		minnoderecs;	/* min records in node block */
	int		sz;		/* root block size */

	/*
	 * The maximum number of extents in a fork, hence the maximum number of
	 * leaf entries, is controlled by the size of the on-disk extent count.
	 *
	 * Note that we can no longer assume that if we are in ATTR1 that the
	 * fork offset of all the inodes will be
	 * (xfs_default_attroffset(ip) >> 3) because we could have mounted with
	 * ATTR2 and then mounted back with ATTR1, keeping the i_forkoff's
	 * fixed but probably at various positions. Therefore, for both ATTR1
	 * and ATTR2 we have to assume the worst case scenario of a minimum
	 * size available.
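	 *
	 * Illustrative walk-through (the per-block record counts here are
	 * assumptions for a sketch, not values taken from this file): if the
	 * minimum leaf and node fanout were both 125 and the inode root held
	 * fewer than 9 records, then for 2^31 extents the computation below
	 * goes 2^31 -> ~17.2M leaves -> ~137k -> ~1.1k -> 9 -> 1 blocks,
	 * ending with a maximum tree depth of 5 including the root.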
*/ maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), whichfork); if (whichfork == XFS_DATA_FORK) sz = xfs_bmdr_space_calc(MINDBTPTRS); else sz = xfs_bmdr_space_calc(MINABTPTRS); maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; maxblocks = howmany_64(maxleafents, minleafrecs); for (level = 1; maxblocks > 1; level++) { if (maxblocks <= maxrootrecs) maxblocks = 1; else maxblocks = howmany_64(maxblocks, minnoderecs); } mp->m_bm_maxlevels[whichfork] = level; ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk()); } unsigned int xfs_bmap_compute_attr_offset( struct xfs_mount *mp) { if (mp->m_sb.sb_inodesize == 256) return XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS); return xfs_bmdr_space_calc(6 * MINABTPTRS); } STATIC int /* error */ xfs_bmbt_lookup_eq( struct xfs_btree_cur *cur, struct xfs_bmbt_irec *irec, int *stat) /* success/failure */ { cur->bc_rec.b = *irec; return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); } STATIC int /* error */ xfs_bmbt_lookup_first( struct xfs_btree_cur *cur, int *stat) /* success/failure */ { cur->bc_rec.b.br_startoff = 0; cur->bc_rec.b.br_startblock = 0; cur->bc_rec.b.br_blockcount = 0; return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } /* * Check if the inode needs to be converted to btree format. */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); return whichfork != XFS_COW_FORK && ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork); } /* * Check if the inode should be converted to extent format. */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); return whichfork != XFS_COW_FORK && ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork); } /* * Update the record referred to by cur to the value given by irec * This either works (return 0) or gets an EFSCORRUPTED error. */ STATIC int xfs_bmbt_update( struct xfs_btree_cur *cur, struct xfs_bmbt_irec *irec) { union xfs_btree_rec rec; xfs_bmbt_disk_set_all(&rec.bmbt, irec); return xfs_btree_update(cur, &rec); } /* * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". */ STATIC xfs_filblks_t xfs_bmap_worst_indlen( xfs_inode_t *ip, /* incore inode pointer */ xfs_filblks_t len) /* delayed extent length */ { int level; /* btree level number */ int maxrecs; /* maximum record count at this level */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t rval; /* return value */ mp = ip->i_mount; maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); level++) { len += maxrecs - 1; do_div(len, maxrecs); rval += len; if (len == 1) return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - level - 1; if (level == 0) maxrecs = mp->m_bmap_dmxr[1]; } return rval; } /* * Calculate the default attribute fork offset for newly created inodes. */ uint xfs_default_attroffset( struct xfs_inode *ip) { if (ip->i_df.if_format == XFS_DINODE_FMT_DEV) return roundup(sizeof(xfs_dev_t), 8); return M_IGEO(ip->i_mount)->attr_fork_offset; } /* * Helper routine to reset inode i_forkoff field when switching attribute fork * from local to extent format - we reset it where possible to make space * available for inline data fork extents. 
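 *
 * (Illustrative note: i_forkoff is kept in units of 8 bytes, which is why
 * xfs_default_attroffset() -- a byte offset -- is shifted right by 3 below
 * before being compared with it; e.g. a 120-byte default offset corresponds
 * to an i_forkoff of 15.)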
 */
STATIC void
xfs_bmap_forkoff_reset(
	xfs_inode_t	*ip,
	int		whichfork)
{
	if (whichfork == XFS_ATTR_FORK &&
	    ip->i_df.if_format != XFS_DINODE_FMT_DEV &&
	    ip->i_df.if_format != XFS_DINODE_FMT_BTREE) {
		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;

		if (dfl_forkoff > ip->i_forkoff)
			ip->i_forkoff = dfl_forkoff;
	}
}

static int
xfs_bmap_read_buf(
	struct xfs_mount	*mp,		/* file system mount point */
	struct xfs_trans	*tp,		/* transaction pointer */
	xfs_fsblock_t		fsbno,		/* file system block number */
	struct xfs_buf		**bpp)		/* buffer for fsbno */
{
	struct xfs_buf		*bp;		/* return value */
	int			error;

	if (!xfs_verify_fsbno(mp, fsbno))
		return -EFSCORRUPTED;
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp,
			&xfs_bmbt_buf_ops);
	if (!error) {
		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
		*bpp = bp;
	}
	return error;
}

#ifdef DEBUG
STATIC struct xfs_buf *
xfs_bmap_get_bp(
	struct xfs_btree_cur	*cur,
	xfs_fsblock_t		bno)
{
	struct xfs_log_item	*lip;
	int			i;

	if (!cur)
		return NULL;

	for (i = 0; i < cur->bc_maxlevels; i++) {
		if (!cur->bc_levels[i].bp)
			break;
		if (xfs_buf_daddr(cur->bc_levels[i].bp) == bno)
			return cur->bc_levels[i].bp;
	}

	/* Chase down all the log items to see if the bp is there */
	list_for_each_entry(lip, &cur->bc_tp->t_items, li_trans) {
		struct xfs_buf_log_item	*bip = (struct xfs_buf_log_item *)lip;

		if (bip->bli_item.li_type == XFS_LI_BUF &&
		    xfs_buf_daddr(bip->bli_buf) == bno)
			return bip->bli_buf;
	}

	return NULL;
}

STATIC void
xfs_check_block(
	struct xfs_btree_block	*block,
	xfs_mount_t		*mp,
	int			root,
	short			sz)
{
	int			i, j, dmxr;
	__be64			*pp, *thispa;	/* pointer to block address */
	xfs_bmbt_key_t		*prevp, *keyp;

	ASSERT(be16_to_cpu(block->bb_level) > 0);

	prevp = NULL;
	for (i = 1; i <= xfs_btree_get_numrecs(block); i++) {
		dmxr = mp->m_bmap_dmxr[0];
		keyp = xfs_bmbt_key_addr(mp, block, i);

		if (prevp) {
			ASSERT(be64_to_cpu(prevp->br_startoff) <
			       be64_to_cpu(keyp->br_startoff));
		}
		prevp = keyp;

		/*
		 * Compare the block numbers to see if there are dups.
		 */
		if (root)
			pp = xfs_bmap_broot_ptr_addr(mp, block, i, sz);
		else
			pp = xfs_bmbt_ptr_addr(mp, block, i, dmxr);

		for (j = i + 1; j <= be16_to_cpu(block->bb_numrecs); j++) {
			if (root)
				thispa = xfs_bmap_broot_ptr_addr(mp, block, j, sz);
			else
				thispa = xfs_bmbt_ptr_addr(mp, block, j, dmxr);
			if (*thispa == *pp) {
				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld",
					__func__, j, i,
					(unsigned long long)be64_to_cpu(*thispa));
				xfs_err(mp, "%s: ptrs are equal in node\n",
					__func__);
				xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
			}
		}
	}
}

/*
 * Check that the extents for the inode ip are in the right order in all
 * btree leaves. This becomes prohibitively expensive for large extent count
 * files, so don't bother with inodes that have more than 10,000 extents in
 * them. The btree record ordering checks will still be done, so for such large
 * bmapbt constructs that is going to catch most corruptions.
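 *
 * The ordering invariant being verified is, for each pair of adjacent
 * records (sketch, using the accessors from the function below):
 *
 *	xfs_bmbt_disk_get_startoff(ep) + xfs_bmbt_disk_get_blockcount(ep)
 *		<= xfs_bmbt_disk_get_startoff(nextp)
 *
 * i.e. records are sorted by file offset and never overlap, both within a
 * leaf and across the boundary from one leaf to the next.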
*/ STATIC void xfs_bmap_check_leaf_extents( struct xfs_btree_cur *cur, /* btree cursor or null */ xfs_inode_t *ip, /* incore inode pointer */ int whichfork) /* data or attr fork */ { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ struct xfs_buf *bp; /* buffer for "block" */ int error; /* error return value */ xfs_extnum_t i=0, j; /* index into the extents list */ int level; /* btree level, for checking */ __be64 *pp; /* pointer to block address */ xfs_bmbt_rec_t *ep; /* pointer to current extent */ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ xfs_bmbt_rec_t *nextp; /* pointer to next extent */ int bp_release = 0; if (ifp->if_format != XFS_DINODE_FMT_BTREE) return; /* skip large extent count inodes */ if (ip->i_df.if_nextents > 10000) return; bno = NULLFSBLOCK; block = ifp->if_broot; /* * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. */ level = be16_to_cpu(block->bb_level); ASSERT(level > 0); xfs_check_block(block, mp, 1, ifp->if_broot_bytes); pp = xfs_bmap_broot_ptr_addr(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); ASSERT(bno != NULLFSBLOCK); ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); /* * Go down the tree until leaf level is reached, following the first * pointer (leftmost) at each level. */ while (level-- > 0) { /* See if buf is in cur first */ bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); if (!bp) { bp_release = 1; error = xfs_bmap_read_buf(mp, NULL, bno, &bp); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(cur); if (error) goto error_norelse; } block = XFS_BUF_TO_BLOCK(bp); if (level == 0) break; /* * Check this block for basic sanity (increasing keys and * no duplicate blocks). */ xfs_check_block(block, mp, 0, 0); pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); } } /* * Here with bp and block set to the leftmost leaf node in the tree. */ i = 0; /* * Loop over all leaf nodes checking that all extents are in the right order. */ for (;;) { xfs_fsblock_t nextbno; xfs_extnum_t num_recs; num_recs = xfs_btree_get_numrecs(block); /* * Read-ahead the next leaf block, if any. */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); /* * Check all the extents to make sure they are OK. * If we had a previous block, the last entry should * conform with the first entry in this one. */ ep = xfs_bmbt_rec_addr(mp, block, 1); if (i) { ASSERT(xfs_bmbt_disk_get_startoff(&last) + xfs_bmbt_disk_get_blockcount(&last) <= xfs_bmbt_disk_get_startoff(ep)); } for (j = 1; j < num_recs; j++) { nextp = xfs_bmbt_rec_addr(mp, block, j + 1); ASSERT(xfs_bmbt_disk_get_startoff(ep) + xfs_bmbt_disk_get_blockcount(ep) <= xfs_bmbt_disk_get_startoff(nextp)); ep = nextp; } last = *ep; i += num_recs; if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); } bno = nextbno; /* * If we've reached the end, stop. 
*/ if (bno == NULLFSBLOCK) break; bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); if (!bp) { bp_release = 1; error = xfs_bmap_read_buf(mp, NULL, bno, &bp); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(cur); if (error) goto error_norelse; } block = XFS_BUF_TO_BLOCK(bp); } return; error0: xfs_warn(mp, "%s: at error0", __func__); if (bp_release) xfs_trans_brelse(NULL, bp); error_norelse: xfs_warn(mp, "%s: BAD after btree leaves for %llu extents", __func__, i); xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return; } /* * Validate that the bmbt_irecs being returned from bmapi are valid * given the caller's original parameters. Specifically check the * ranges of the returned irecs to ensure that they only extend beyond * the given parameters if the XFS_BMAPI_ENTIRE flag was set. */ STATIC void xfs_bmap_validate_ret( xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_bmbt_irec_t *mval, int nmap, int ret_nmap) { int i; /* index to map values */ ASSERT(ret_nmap <= nmap); for (i = 0; i < ret_nmap; i++) { ASSERT(mval[i].br_blockcount > 0); if (!(flags & XFS_BMAPI_ENTIRE)) { ASSERT(mval[i].br_startoff >= bno); ASSERT(mval[i].br_blockcount <= len); ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= bno + len); } else { ASSERT(mval[i].br_startoff < bno + len); ASSERT(mval[i].br_startoff + mval[i].br_blockcount > bno); } ASSERT(i == 0 || mval[i - 1].br_startoff + mval[i - 1].br_blockcount == mval[i].br_startoff); ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && mval[i].br_startblock != HOLESTARTBLOCK); ASSERT(mval[i].br_state == XFS_EXT_NORM || mval[i].br_state == XFS_EXT_UNWRITTEN); } } #else #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) #define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0) #endif /* DEBUG */ /* * Inode fork format manipulation functions */ /* * Convert the inode format to extent format if it currently is in btree format, * but the extent list is small enough that it fits into the extent format. * * Since the extents are already in-core, all we have to do is give up the space * for the btree root and pitch the leaf block. 
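 *
 * The conversion only makes sense once the tree has shrunk to its minimal
 * shape, which the function below asserts before touching anything:
 *
 *	ASSERT(be16_to_cpu(rblock->bb_level) == 1);	// root points at leaves
 *	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);	// exactly one child
 *
 * so after freeing that single child block the whole mapping fits back into
 * the inode fork as a plain extent list.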
*/ STATIC int /* error */ xfs_bmap_btree_to_extents( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode pointer */ struct xfs_btree_cur *cur, /* btree cursor */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ xfs_fsblock_t cbno; /* child block number */ struct xfs_buf *cbp; /* child block's buffer */ int error; /* error return value */ __be64 *pp; /* ptr to block address */ struct xfs_owner_info oinfo; /* check if we actually need the extent format first: */ if (!xfs_bmap_wants_extents(ip, whichfork)) return 0; ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false) == 1); pp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); #ifdef DEBUG if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } #endif error = xfs_bmap_read_buf(mp, tp, cbno, &cbp); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(cur); if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, XFS_AG_RESV_NONE, 0); if (error) return error; ip->i_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); if (cur->bc_levels[0].bp == cbp) cur->bc_levels[0].bp = NULL; xfs_bmap_broot_realloc(ip, whichfork, 0); ASSERT(ifp->if_broot == NULL); ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } /* * Convert an extents-format file into a btree-format file. * The new file will have a root block (in the inode) and a single child block. */ STATIC int /* error */ xfs_bmap_extents_to_btree( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode pointer */ struct xfs_btree_cur **curp, /* cursor returned to caller */ int wasdel, /* converting a delayed alloc */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { struct xfs_btree_block *ablock; /* allocated (child) bt block */ struct xfs_buf *abp; /* buffer for ablock */ struct xfs_alloc_arg args; /* allocation arguments */ struct xfs_bmbt_rec *arp; /* child record pointer */ struct xfs_btree_block *block; /* btree root block */ struct xfs_btree_cur *cur; /* bmap btree cursor */ int error; /* error return value */ struct xfs_ifork *ifp; /* inode fork pointer */ struct xfs_bmbt_key *kp; /* root block key pointer */ struct xfs_mount *mp; /* mount structure */ xfs_bmbt_ptr_t *pp; /* root block address pointer */ struct xfs_iext_cursor icur; struct xfs_bmbt_irec rec; xfs_extnum_t cnt = 0; mp = ip->i_mount; ASSERT(whichfork != XFS_COW_FORK); ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS); /* * Make space in the inode incore. This needs to be undone if we fail * to expand the root. */ block = xfs_bmap_broot_realloc(ip, whichfork, 1); /* * Fill in the root. */ xfs_bmbt_init_block(ip, block, NULL, 1, 1); /* * Need a cursor. Can't allocate until bb_level is filled in. 
*/ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); if (wasdel) cur->bc_flags |= XFS_BTREE_BMBT_WASDEL; /* * Convert to a btree with two levels, one record in root. */ ifp->if_format = XFS_DINODE_FMT_BTREE; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork); args.minlen = args.maxlen = args.prod = 1; args.wasdel = wasdel; *logflagsp = 0; error = xfs_alloc_vextent_start_ag(&args, XFS_INO_TO_FSB(mp, ip->i_ino)); if (error) goto out_root_realloc; /* * Allocation can't fail, the space was reserved. */ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { error = -ENOSPC; goto out_root_realloc; } cur->bc_bmap.allocated++; ip->i_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, args.fsbno), mp->m_bsize, 0, &abp); if (error) goto out_unreserve_dquot; /* * Fill in the child block. */ ablock = XFS_BUF_TO_BLOCK(abp); xfs_bmbt_init_block(ip, ablock, abp, 0, 0); for_each_xfs_iext(ifp, &icur, &rec) { if (isnullstartblock(rec.br_startblock)) continue; arp = xfs_bmbt_rec_addr(mp, ablock, 1 + cnt); xfs_bmbt_disk_set_all(arp, &rec); cnt++; } ASSERT(cnt == ifp->if_nextents); xfs_btree_set_numrecs(ablock, cnt); /* * Fill in the root key and pointer. */ kp = xfs_bmbt_key_addr(mp, block, 1); arp = xfs_bmbt_rec_addr(mp, ablock, 1); kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); pp = xfs_bmbt_ptr_addr(mp, block, 1, xfs_bmbt_get_maxrecs(cur, be16_to_cpu(block->bb_level))); *pp = cpu_to_be64(args.fsbno); /* * Do all this logging at the end so that * the root is at the right level. */ xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); ASSERT(*curp == NULL); *curp = cur; *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); return 0; out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); out_root_realloc: xfs_bmap_broot_realloc(ip, whichfork, 0); ifp->if_format = XFS_DINODE_FMT_EXTENTS; ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Convert a local file to an extents file. * This code is out of bounds for data forks of regular files, * since the file data needs to get logged so things will stay consistent. * (The bmap-level manipulations are ok, though). */ void xfs_bmap_local_to_extents_empty( struct xfs_trans *tp, struct xfs_inode *ip, int whichfork) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_bytes == 0); ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); ifp->if_data = NULL; ifp->if_height = 0; ifp->if_format = XFS_DINODE_FMT_EXTENTS; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } int /* error */ xfs_bmap_local_to_extents( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extlen_t total, /* total blocks needed by transaction */ int *logflagsp, /* inode logging flags */ int whichfork, void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp, void *priv), void *priv) { int error = 0; int flags; /* logging flags returned */ struct xfs_ifork *ifp; /* inode fork pointer */ xfs_alloc_arg_t args; /* allocation arguments */ struct xfs_buf *bp; /* buffer for extent block */ struct xfs_bmbt_irec rec; struct xfs_iext_cursor icur; /* * We don't want to deal with the case of keeping inode data inline yet. 
* So sending the data fork of a regular inode is invalid. */ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK)); ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags = XFS_ILOG_CORE; goto done; } flags = 0; error = 0; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.total = total; args.minlen = args.maxlen = args.prod = 1; xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0); /* * Allocate a block. We know we need only one, since the * file currently fits in an inode. */ args.total = total; args.minlen = args.maxlen = args.prod = 1; error = xfs_alloc_vextent_start_ag(&args, XFS_INO_TO_FSB(args.mp, ip->i_ino)); if (error) goto done; /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, XFS_FSB_TO_DADDR(args.mp, args.fsbno), args.mp->m_bsize, 0, &bp); if (error) goto done; /* * Initialize the block, copy the data and log the remote buffer. * * The callout is responsible for logging because the remote format * might differ from the local format and thus we don't know how much to * log here. Note that init_fn must also set the buffer log item type * correctly. */ init_fn(tp, bp, ip, ifp, priv); /* account for the change in fork size */ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags |= XFS_ILOG_CORE; ifp->if_data = NULL; ifp->if_height = 0; rec.br_startoff = 0; rec.br_startblock = args.fsbno; rec.br_blockcount = 1; rec.br_state = XFS_EXT_NORM; xfs_iext_first(ifp, &icur); xfs_iext_insert(ip, &icur, &rec, 0); ifp->if_nextents = 1; ip->i_nblocks = 1; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); flags |= xfs_ilog_fext(whichfork); done: *logflagsp = flags; return error; } /* * Called from xfs_bmap_add_attrfork to handle btree format files. */ STATIC int /* error */ xfs_bmap_add_attrfork_btree( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ int *flags) /* inode logging flags */ { struct xfs_btree_block *block = ip->i_df.if_broot; struct xfs_btree_cur *cur; /* btree cursor */ int error; /* error return value */ xfs_mount_t *mp; /* file system mount struct */ int stat; /* newroot status */ mp = ip->i_mount; if (xfs_bmap_bmdr_space(block) <= xfs_inode_data_fork_size(ip)) *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); error = xfs_bmbt_lookup_first(cur, &stat); if (error) goto error0; /* must be at least one entry */ if (XFS_IS_CORRUPT(mp, stat != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_new_iroot(cur, flags, &stat))) goto error0; if (stat == 0) { xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return -ENOSPC; } cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); } return 0; error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Called from xfs_bmap_add_attrfork to handle extents format files. 
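 *
 * Sizing example (assumed numbers, for illustration only): each mapping
 * costs a 16-byte struct xfs_bmbt_rec in the data fork, so if the fork had,
 * say, 112 bytes of space left once the attr fork is placed, more than 7
 * extents would force the extents-to-btree conversion done below.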
 */
STATIC int					/* error */
xfs_bmap_add_attrfork_extents(
	struct xfs_trans	*tp,		/* transaction pointer */
	struct xfs_inode	*ip,		/* incore inode pointer */
	int			*flags)		/* inode logging flags */
{
	struct xfs_btree_cur	*cur;		/* bmap btree cursor */
	int			error;		/* error return value */

	if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
	    xfs_inode_data_fork_size(ip))
		return 0;
	cur = NULL;
	error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
					  XFS_DATA_FORK);
	if (cur) {
		cur->bc_bmap.allocated = 0;
		xfs_btree_del_cursor(cur, error);
	}
	return error;
}

/*
 * Called from xfs_bmap_add_attrfork to handle local format files. Each
 * different data fork content type needs a different callout to do the
 * conversion. Some are basic and only require special block initialisation
 * callouts for the data formatting, others (directories) are so specialised
 * they handle everything themselves.
 *
 * XXX (dgc): investigate whether directory conversion can use the generic
 * formatting callout. It should be possible - it's just a very complex
 * formatter.
 */
STATIC int					/* error */
xfs_bmap_add_attrfork_local(
	struct xfs_trans	*tp,		/* transaction pointer */
	struct xfs_inode	*ip,		/* incore inode pointer */
	int			*flags)		/* inode logging flags */
{
	struct xfs_da_args	dargs;		/* args for dir/attr code */

	if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip))
		return 0;

	if (S_ISDIR(VFS_I(ip)->i_mode)) {
		memset(&dargs, 0, sizeof(dargs));
		dargs.geo = ip->i_mount->m_dir_geo;
		dargs.dp = ip;
		dargs.total = dargs.geo->fsbcount;
		dargs.whichfork = XFS_DATA_FORK;
		dargs.trans = tp;
		dargs.owner = ip->i_ino;
		return xfs_dir2_sf_to_block(&dargs);
	}

	if (S_ISLNK(VFS_I(ip)->i_mode))
		return xfs_bmap_local_to_extents(tp, ip, 1, flags,
						 XFS_DATA_FORK,
						 xfs_symlink_local_to_remote,
						 NULL);

	/* should only be called for types that support local format data */
	ASSERT(0);
	xfs_bmap_mark_sick(ip, XFS_ATTR_FORK);
	return -EFSCORRUPTED;
}

/*
 * Set an inode attr fork offset based on the format of the data fork.
 */
static int
xfs_bmap_set_attrforkoff(
	struct xfs_inode	*ip,
	int			size,
	int			*version)
{
	int			default_size = xfs_default_attroffset(ip) >> 3;

	switch (ip->i_df.if_format) {
	case XFS_DINODE_FMT_DEV:
		ip->i_forkoff = default_size;
		break;
	case XFS_DINODE_FMT_LOCAL:
	case XFS_DINODE_FMT_EXTENTS:
	case XFS_DINODE_FMT_BTREE:
		ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size);
		if (!ip->i_forkoff)
			ip->i_forkoff = default_size;
		else if (xfs_has_attr2(ip->i_mount) && version)
			*version = 2;
		break;
	default:
		ASSERT(0);
		return -EINVAL;
	}

	return 0;
}

/*
 * Convert inode from non-attributed to attributed. Caller must hold the
 * ILOCK_EXCL and the file cannot have an attr fork.
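 *
 * The first attr fork created on an attr1 filesystem can also upgrade the
 * superblock: the feature bits are added under m_sb_lock and the superblock
 * is logged once, which is why that handling sits after the per-format
 * switch in the function below rather than inside it.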
*/ int /* error code */ xfs_bmap_add_attrfork( struct xfs_trans *tp, struct xfs_inode *ip, /* incore inode pointer */ int size, /* space new attribute needs */ int rsvd) /* xact may use reserved blks */ { struct xfs_mount *mp = tp->t_mountp; int version = 1; /* superblock attr version */ int logflags; /* logging flags */ int error; /* error return value */ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (!xfs_is_metadir_inode(ip)) ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) return error; xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); logflags = 0; switch (ip->i_df.if_format) { case XFS_DINODE_FMT_LOCAL: error = xfs_bmap_add_attrfork_local(tp, ip, &logflags); break; case XFS_DINODE_FMT_EXTENTS: error = xfs_bmap_add_attrfork_extents(tp, ip, &logflags); break; case XFS_DINODE_FMT_BTREE: error = xfs_bmap_add_attrfork_btree(tp, ip, &logflags); break; default: error = 0; break; } if (logflags) xfs_trans_log_inode(tp, ip, logflags); if (error) return error; if (!xfs_has_attr(mp) || (!xfs_has_attr2(mp) && version == 2)) { bool log_sb = false; spin_lock(&mp->m_sb_lock); if (!xfs_has_attr(mp)) { xfs_add_attr(mp); log_sb = true; } if (!xfs_has_attr2(mp) && version == 2) { xfs_add_attr2(mp); log_sb = true; } spin_unlock(&mp->m_sb_lock); if (log_sb) xfs_log_sb(tp); } return 0; } /* * Internal and external extent tree search functions. */ struct xfs_iread_state { struct xfs_iext_cursor icur; xfs_extnum_t loaded; }; int xfs_bmap_complain_bad_rec( struct xfs_inode *ip, int whichfork, xfs_failaddr_t fa, const struct xfs_bmbt_irec *irec) { struct xfs_mount *mp = ip->i_mount; const char *forkname; switch (whichfork) { case XFS_DATA_FORK: forkname = "data"; break; case XFS_ATTR_FORK: forkname = "attr"; break; case XFS_COW_FORK: forkname = "CoW"; break; default: forkname = "???"; break; } xfs_warn(mp, "Bmap BTree record corruption in inode 0x%llx %s fork detected at %pS!", ip->i_ino, forkname, fa); xfs_warn(mp, "Offset 0x%llx, start block 0x%llx, block count 0x%llx state 0x%x", irec->br_startoff, irec->br_startblock, irec->br_blockcount, irec->br_state); return -EFSCORRUPTED; } /* Stuff every bmbt record from this block into the incore extent map. */ static int xfs_iread_bmbt_block( struct xfs_btree_cur *cur, int level, void *priv) { struct xfs_iread_state *ir = priv; struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_btree_block *block; struct xfs_buf *bp; struct xfs_bmbt_rec *frp; xfs_extnum_t num_recs; xfs_extnum_t j; int whichfork = cur->bc_ino.whichfork; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); block = xfs_btree_get_block(cur, level, &bp); /* Abort if we find more records than nextents. */ num_recs = xfs_btree_get_numrecs(block); if (unlikely(ir->loaded + num_recs > ifp->if_nextents)) { xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).", (unsigned long long)ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block, sizeof(*block), __this_address); xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } /* Copy records into the incore cache. 
*/ frp = xfs_bmbt_rec_addr(mp, block, 1); for (j = 0; j < num_recs; j++, frp++, ir->loaded++) { struct xfs_bmbt_irec new; xfs_failaddr_t fa; xfs_bmbt_disk_get_all(frp, &new); fa = xfs_bmap_validate_extent(ip, whichfork, &new); if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iread_extents(2)", frp, sizeof(*frp), fa); xfs_bmap_mark_sick(ip, whichfork); return xfs_bmap_complain_bad_rec(ip, whichfork, fa, &new); } xfs_iext_insert(ip, &ir->icur, &new, xfs_bmap_fork_to_state(whichfork)); trace_xfs_read_extent(ip, &ir->icur, xfs_bmap_fork_to_state(whichfork), _THIS_IP_); xfs_iext_next(ifp, &ir->icur); } return 0; } /* * Read in extents from a btree-format inode. */ int xfs_iread_extents( struct xfs_trans *tp, struct xfs_inode *ip, int whichfork) { struct xfs_iread_state ir; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_cur *cur; int error; if (!xfs_need_iread_extents(ifp)) return 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ir.loaded = 0; xfs_iext_first(ifp, &ir.icur); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); error = xfs_btree_visit_blocks(cur, xfs_iread_bmbt_block, XFS_BTREE_VISIT_RECORDS, &ir); xfs_btree_del_cursor(cur, error); if (error) goto out; if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) { xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto out; } ASSERT(ir.loaded == xfs_iext_count(ifp)); /* * Use release semantics so that we can use acquire semantics in * xfs_need_iread_extents and be guaranteed to see a valid mapping tree * after that load. */ smp_store_release(&ifp->if_needextents, 0); return 0; out: if (xfs_metadata_is_sick(error)) xfs_bmap_mark_sick(ip, whichfork); xfs_iext_destroy(ifp); return error; } /* * Returns the relative block number of the first unused block(s) in the given * fork with at least "len" logically contiguous blocks free. This is the * lowest-address hole if the fork has holes, else the first block past the end * of fork. Return 0 if the fork is currently local (in-inode). */ int /* error */ xfs_bmap_first_unused( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ xfs_extlen_t len, /* size of hole to find */ xfs_fileoff_t *first_unused, /* unused block */ int whichfork) /* data or attr fork */ { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; xfs_fileoff_t lastaddr = 0; xfs_fileoff_t lowest, max; int error; if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { *first_unused = 0; return 0; } ASSERT(xfs_ifork_has_extents(ifp)); error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; lowest = max = *first_unused; for_each_xfs_iext(ifp, &icur, &got) { /* * See if the hole before this extent will work. */ if (got.br_startoff >= lowest + len && got.br_startoff - max >= len) break; lastaddr = got.br_startoff + got.br_blockcount; max = XFS_FILEOFF_MAX(lastaddr, lowest); } *first_unused = max; return 0; } /* * Returns the file-relative block number of the last block - 1 before * last_block (input value) in the file. * This is not based on i_size, it is based on the extent records. * Returns 0 for local files, as they do not have extent records. 
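 *
 * (Illustrative consequence of being extent-based rather than size-based:
 * speculative preallocation beyond EOF is still visible to these helpers
 * even though i_size has not moved.)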
 */
int						/* error */
xfs_bmap_last_before(
	struct xfs_trans	*tp,		/* transaction pointer */
	struct xfs_inode	*ip,		/* incore inode */
	xfs_fileoff_t		*last_block,	/* last block */
	int			whichfork)	/* data or attr fork */
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_bmbt_irec	got;
	struct xfs_iext_cursor	icur;
	int			error;

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_LOCAL:
		*last_block = 0;
		return 0;
	case XFS_DINODE_FMT_BTREE:
	case XFS_DINODE_FMT_EXTENTS:
		break;
	default:
		ASSERT(0);
		xfs_bmap_mark_sick(ip, whichfork);
		return -EFSCORRUPTED;
	}

	error = xfs_iread_extents(tp, ip, whichfork);
	if (error)
		return error;

	if (!xfs_iext_lookup_extent_before(ip, ifp, last_block, &icur, &got))
		*last_block = 0;
	return 0;
}

int
xfs_bmap_last_extent(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	int			whichfork,
	struct xfs_bmbt_irec	*rec,
	int			*is_empty)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_iext_cursor	icur;
	int			error;

	error = xfs_iread_extents(tp, ip, whichfork);
	if (error)
		return error;

	xfs_iext_last(ifp, &icur);
	if (!xfs_iext_get_extent(ifp, &icur, rec))
		*is_empty = 1;
	else
		*is_empty = 0;
	return 0;
}

/*
 * Check the last inode extent to determine whether this allocation will result
 * in blocks being allocated at the end of the file. When we allocate new data
 * blocks at the end of the file which do not start at the previous data block,
 * we will try to align the new blocks at stripe unit boundaries.
 *
 * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
 * at, or past, the EOF.
 */
STATIC int
xfs_bmap_isaeof(
	struct xfs_bmalloca	*bma,
	int			whichfork)
{
	struct xfs_bmbt_irec	rec;
	int			is_empty;
	int			error;

	bma->aeof = false;
	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
				     &is_empty);
	if (error)
		return error;

	if (is_empty) {
		bma->aeof = true;
		return 0;
	}

	/*
	 * Check if we are allocating at or past the last extent, or at least
	 * into the last delayed allocated extent.
	 */
	bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
		(bma->offset >= rec.br_startoff &&
		 isnullstartblock(rec.br_startblock));
	return 0;
}

/*
 * Returns the file-relative block number of the first block past eof in
 * the file.  This is not based on i_size, it is based on the extent records.
 * Returns 0 for local files, as they do not have extent records.
 */
int
xfs_bmap_last_offset(
	struct xfs_inode	*ip,
	xfs_fileoff_t		*last_block,
	int			whichfork)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_bmbt_irec	rec;
	int			is_empty;
	int			error;

	*last_block = 0;

	if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
		return 0;

	if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) {
		xfs_bmap_mark_sick(ip, whichfork);
		return -EFSCORRUPTED;
	}

	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
	if (error || is_empty)
		return error;

	*last_block = rec.br_startoff + rec.br_blockcount;
	return 0;
}

/*
 * Extent tree manipulation functions used during allocation.
 */

static inline bool
xfs_bmap_same_rtgroup(
	struct xfs_inode	*ip,
	int			whichfork,
	struct xfs_bmbt_irec	*left,
	struct xfs_bmbt_irec	*right)
{
	struct xfs_mount	*mp = ip->i_mount;

	if (xfs_ifork_is_realtime(ip, whichfork) && xfs_has_rtgroups(mp)) {
		if (xfs_rtb_to_rgno(mp, left->br_startblock) !=
		    xfs_rtb_to_rgno(mp, right->br_startblock))
			return false;
	}

	return true;
}

/*
 * Convert a delayed allocation to a real allocation.
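 *
 * Bookkeeping sketch (terminology used throughout the function below):
 * da_old is the worst-case indirect-block reservation carried by the
 * delalloc extent being replaced, da_new the reservation still needed by
 * whatever delalloc remains afterwards.  The difference is returned to
 * (or, rarely, taken from) the free block counter at the end:
 *
 *	if (da_new < da_old)
 *		xfs_add_fdblocks(mp, da_old - da_new);
 *	else if (da_new > da_old)
 *		error = xfs_dec_fdblocks(mp, da_new - da_old, true);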
*/ STATIC int /* error */ xfs_bmap_add_extent_delay_real( struct xfs_bmalloca *bma, int whichfork) { struct xfs_mount *mp = bma->ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ xfs_fileoff_t new_endoff; /* end offset of new entry */ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ int tmp_rval; /* partial logging flags */ struct xfs_bmbt_irec old; ASSERT(whichfork != XFS_ATTR_FORK); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || (bma->cur->bc_flags & XFS_BTREE_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] /* * Set up a bunch of variables to make the tests simpler. */ xfs_iext_get_extent(ifp, &bma->icur, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(isnullstartblock(PREV.br_startblock)); ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); da_old = startblockval(PREV.br_startblock); da_new = 0; /* * Set flags determining what part of the previous delayed allocation * extent is being replaced by a real allocation. */ if (PREV.br_startoff == new->br_startoff) state |= BMAP_LEFT_FILLING; if (PREV.br_startoff + PREV.br_blockcount == new_endoff) state |= BMAP_RIGHT_FILLING; /* * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ if (xfs_iext_peek_prev_extent(ifp, &bma->icur, &LEFT)) { state |= BMAP_LEFT_VALID; if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; } if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && xfs_bmap_same_rtgroup(bma->ip, whichfork, &LEFT, new)) state |= BMAP_LEFT_CONTIG; /* * Check and set flags if this segment has a right neighbor. * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ if (xfs_iext_peek_next_extent(ifp, &bma->icur, &RIGHT)) { state |= BMAP_RIGHT_VALID; if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; } if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN) && xfs_bmap_same_rtgroup(bma->ip, whichfork, new, &RIGHT)) state |= BMAP_RIGHT_CONTIG; error = 0; /* * Switch out based on the FILLING and CONTIG state bits. 
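	 *
	 * Four bits are tested (BMAP_LEFT_FILLING, BMAP_LEFT_CONTIG,
	 * BMAP_RIGHT_FILLING, BMAP_RIGHT_CONTIG), giving sixteen cases.
	 * Seven of the combinations cannot occur, because an extent can only
	 * be contiguous with a neighbor on a side that is actually being
	 * filled; those seven fall through to the ASSERT(0) arm at the
	 * bottom of the switch.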
*/ switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount; xfs_iext_remove(bma->ip, &bma->icur, state); xfs_iext_remove(bma->ip, &bma->icur, state); xfs_iext_prev(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); ifp->if_nextents--; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } error = xfs_btree_delete(bma->cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; } ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: /* * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ old = LEFT; LEFT.br_blockcount += PREV.br_blockcount; xfs_iext_remove(bma->ip, &bma->icur, state); xfs_iext_prev(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; } ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. Take care * with delay -> unwritten extent allocation here because the * delalloc record we are overwriting is always written. */ PREV.br_startblock = new->br_startblock; PREV.br_blockcount += RIGHT.br_blockcount; PREV.br_state = new->br_state; xfs_iext_next(ifp, &bma->icur); xfs_iext_remove(bma->ip, &bma->icur, state); xfs_iext_prev(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(bma->cur, &PREV); if (error) goto done; } ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Filling in all of a previously delayed allocation extent. * Neither the left nor right neighbors are contiguous with * the new one. 
*/ PREV.br_startblock = new->br_startblock; PREV.br_state = new->br_state; xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); ifp->if_nextents++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } } ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: /* * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ old = LEFT; temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); LEFT.br_blockcount += new->br_blockcount; PREV.br_blockcount = temp; PREV.br_startoff += new->br_blockcount; PREV.br_startblock = nullstartblock(da_new); xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); xfs_iext_prev(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(bma->cur, &LEFT); if (error) goto done; } ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING: /* * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ xfs_iext_update_extent(bma->ip, state, &bma->icur, new); ifp->if_nextents++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } } if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; } temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - (bma->cur ? bma->cur->bc_bmap.allocated : 0)); PREV.br_startoff = new_endoff; PREV.br_blockcount = temp; PREV.br_startblock = nullstartblock(da_new); xfs_iext_next(ifp, &bma->icur); xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); xfs_iext_prev(ifp, &bma->icur); break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in the last part of a previous delayed allocation. * The right neighbor is contiguous with the new allocation. 
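		 *
		 * In the style of the diagram in the case 0 comment further
		 * down (illustrative):
		 *
		 *	+ddddddddd+rrrrrr+RRRRRR+      +ddddddddd+RRRRRRRRRRRR+
		 *	 PREV      new    RIGHT   -->   PREV      RIGHT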
*/ old = RIGHT; RIGHT.br_startoff = new->br_startoff; RIGHT.br_startblock = new->br_startblock; RIGHT.br_blockcount += new->br_blockcount; if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; error = xfs_bmbt_lookup_eq(bma->cur, &old, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(bma->cur, &RIGHT); if (error) goto done; } temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock)); PREV.br_blockcount = temp; PREV.br_startblock = nullstartblock(da_new); xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); xfs_iext_next(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT); ASSERT(da_new <= da_old); break; case BMAP_RIGHT_FILLING: /* * Filling in the last part of a previous delayed allocation. * The right neighbor is not contiguous. */ xfs_iext_update_extent(bma->ip, state, &bma->icur, new); ifp->if_nextents++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(bma->cur, new, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } error = xfs_btree_insert(bma->cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } } if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; } temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - (bma->cur ? bma->cur->bc_bmap.allocated : 0)); PREV.br_startblock = nullstartblock(da_new); PREV.br_blockcount = temp; xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); xfs_iext_next(ifp, &bma->icur); ASSERT(da_new <= da_old); break; case 0: /* * Filling in the middle part of a previous delayed allocation. * Contiguity is impossible here. * This case is avoided almost all the time. 
		 *
		 * We start with a delayed allocation:
		 *
		 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
		 *  PREV @ idx
		 *
		 * and we are allocating:
		 *                     +rrrrrrrrrrrrrrrrr+
		 *                            new
		 *
		 * and we set it up for insertion as:
		 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
		 *                            new
		 *  PREV @ idx          LEFT              RIGHT
		 *                      inserted at idx + 1
		 */
		old = PREV;

		/* LEFT is the new middle */
		LEFT = *new;

		/* RIGHT is the new right */
		RIGHT.br_state = PREV.br_state;
		RIGHT.br_startoff = new_endoff;
		RIGHT.br_blockcount =
			PREV.br_startoff + PREV.br_blockcount - new_endoff;
		RIGHT.br_startblock =
			nullstartblock(xfs_bmap_worst_indlen(bma->ip,
					RIGHT.br_blockcount));

		/* truncate PREV */
		PREV.br_blockcount = new->br_startoff - PREV.br_startoff;
		PREV.br_startblock =
			nullstartblock(xfs_bmap_worst_indlen(bma->ip,
					PREV.br_blockcount));
		xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);

		xfs_iext_next(ifp, &bma->icur);
		xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state);
		xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state);
		ifp->if_nextents++;

		if (bma->cur == NULL)
			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
		else {
			rval = XFS_ILOG_CORE;
			error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
			if (error)
				goto done;
			if (XFS_IS_CORRUPT(mp, i != 0)) {
				xfs_btree_mark_sick(bma->cur);
				error = -EFSCORRUPTED;
				goto done;
			}
			error = xfs_btree_insert(bma->cur, &i);
			if (error)
				goto done;
			if (XFS_IS_CORRUPT(mp, i != 1)) {
				xfs_btree_mark_sick(bma->cur);
				error = -EFSCORRUPTED;
				goto done;
			}
		}

		if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
					&bma->cur, 1, &tmp_rval, whichfork);
			rval |= tmp_rval;
			if (error)
				goto done;
		}

		da_new = startblockval(PREV.br_startblock) +
			 startblockval(RIGHT.br_startblock);
		break;

	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
	case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
	case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
	case BMAP_LEFT_CONTIG:
	case BMAP_RIGHT_CONTIG:
		/*
		 * These cases are all impossible.
		 */
		ASSERT(0);
	}

	/* add reverse mapping unless caller opted out */
	if (!(bma->flags & XFS_BMAPI_NORMAP))
		xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);

	/* convert to a btree if necessary */
	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
		int	tmp_logflags;	/* partial log flag return val */

		ASSERT(bma->cur == NULL);
		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
				&bma->cur, da_old > 0, &tmp_logflags,
				whichfork);
		bma->logflags |= tmp_logflags;
		if (error)
			goto done;
	}

	if (da_new != da_old)
		xfs_mod_delalloc(bma->ip, 0, (int64_t)da_new - da_old);

	if (bma->cur) {
		da_new += bma->cur->bc_bmap.allocated;
		bma->cur->bc_bmap.allocated = 0;
	}

	/* adjust for changes in reserved delayed indirect blocks */
	if (da_new < da_old)
		xfs_add_fdblocks(mp, da_old - da_new);
	else if (da_new > da_old)
		error = xfs_dec_fdblocks(mp, da_new - da_old, true);

	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
	if (whichfork != XFS_COW_FORK)
		bma->logflags |= rval;
	return error;
#undef	LEFT
#undef	RIGHT
#undef	PREV
}

/*
 * Convert an unwritten allocation to a real allocation or vice versa.
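 *
 * The common trigger (illustrative) is I/O completion over preallocated
 * space: blocks handed out by fallocate are XFS_EXT_UNWRITTEN so stale disk
 * contents can never be read back, and once data has actually been written
 * the covered range is flipped to XFS_EXT_NORM here.  The same sixteen-case
 * FILLING/CONTIG switch as in the delalloc path above then decides how the
 * flipped range merges with its neighbors.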
*/ int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ int whichfork, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp) /* inode logging flags */ { struct xfs_btree_cur *cur; /* btree cursor */ int error; /* error return value */ int i; /* temp state */ struct xfs_ifork *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec old; *logflagsp = 0; cur = *curp; ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(!isnullstartblock(new->br_startblock)); XFS_STATS_INC(mp, xs_add_exlist); #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] /* * Set up a bunch of variables to make the tests simpler. */ error = 0; xfs_iext_get_extent(ifp, icur, &PREV); ASSERT(new->br_state != PREV.br_state); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); /* * Set flags determining what part of the previous oldext allocation * extent is being replaced by a newext allocation. */ if (PREV.br_startoff == new->br_startoff) state |= BMAP_LEFT_FILLING; if (PREV.br_startoff + PREV.br_blockcount == new_endoff) state |= BMAP_RIGHT_FILLING; /* * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ if (xfs_iext_peek_prev_extent(ifp, icur, &LEFT)) { state |= BMAP_LEFT_VALID; if (isnullstartblock(LEFT.br_startblock)) state |= BMAP_LEFT_DELAY; } if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && xfs_bmap_same_rtgroup(ip, whichfork, &LEFT, new)) state |= BMAP_LEFT_CONTIG; /* * Check and set flags if this segment has a right neighbor. * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ if (xfs_iext_peek_next_extent(ifp, icur, &RIGHT)) { state |= BMAP_RIGHT_VALID; if (isnullstartblock(RIGHT.br_startblock)) state |= BMAP_RIGHT_DELAY; } if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN) && xfs_bmap_same_rtgroup(ip, whichfork, new, &RIGHT)) state |= BMAP_RIGHT_CONTIG; /* * Switch out based on the FILLING and CONTIG state bits. */ switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Setting all of a previous oldext extent to newext. * The left and right neighbors are both contiguous with new. 
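		 *
		 * Sketch: three records collapse into one, so two btree
		 * records are deleted and the surviving left record is
		 * rewritten -- which is why the btree arm below does
		 * delete/decrement twice before the final update, and why
		 * if_nextents drops by 2.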
*/ LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount; xfs_iext_remove(ip, icur, state); xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); ifp->if_nextents -= 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &LEFT); if (error) goto done; } break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: /* * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ LEFT.br_blockcount += PREV.br_blockcount; xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); ifp->if_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(cur, &PREV, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &LEFT); if (error) goto done; } break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Setting all of a previous oldext extent to newext. * The right neighbor is contiguous, the left is not. */ PREV.br_blockcount += RIGHT.br_blockcount; PREV.br_state = new->br_state; xfs_iext_next(ifp, icur); xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &PREV); ifp->if_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; } break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Setting all of a previous oldext extent to newext. * Neither the left nor right neighbors are contiguous with * the new one. 
*/ PREV.br_state = new->br_state; xfs_iext_update_extent(ip, state, icur, &PREV); if (cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; } break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: /* * Setting the first part of a previous oldext extent to newext. * The left neighbor is contiguous. */ LEFT.br_blockcount += new->br_blockcount; old = PREV; PREV.br_startoff += new->br_blockcount; PREV.br_startblock += new->br_blockcount; PREV.br_blockcount -= new->br_blockcount; xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); if (cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; error = xfs_bmbt_update(cur, &LEFT); if (error) goto done; } break; case BMAP_LEFT_FILLING: /* * Setting the first part of a previous oldext extent to newext. * The left neighbor is not contiguous. */ old = PREV; PREV.br_startoff += new->br_blockcount; PREV.br_startblock += new->br_blockcount; PREV.br_blockcount -= new->br_blockcount; xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_insert(ip, icur, new, state); ifp->if_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; cur->bc_rec.b = *new; if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } } break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Setting the last part of a previous oldext extent to newext. * The right neighbor is contiguous with the new allocation. */ old = PREV; PREV.br_blockcount -= new->br_blockcount; RIGHT.br_startoff = new->br_startoff; RIGHT.br_startblock = new->br_startblock; RIGHT.br_blockcount += new->br_blockcount; xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_next(ifp, icur); xfs_iext_update_extent(ip, state, icur, &RIGHT); if (cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; error = xfs_btree_increment(cur, 0, &i); if (error) goto done; error = xfs_bmbt_update(cur, &RIGHT); if (error) goto done; } break; case BMAP_RIGHT_FILLING: /* * Setting the last part of a previous oldext extent to newext. * The right neighbor is not contiguous. 
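 * The converted tail is split off from PREV as a record of its own, so
 * the extent count grows by one and the inode core has to be logged
 * along with the bmbt insert below.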
*/ old = PREV; PREV.br_blockcount -= new->br_blockcount; xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, new, state); ifp->if_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &PREV); if (error) goto done; error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } } break; case 0: /* * Setting the middle part of a previous oldext extent to * newext. Contiguity is impossible here. * One extent becomes three extents. */ old = PREV; PREV.br_blockcount = new->br_startoff - PREV.br_startoff; r[0] = *new; r[1].br_startoff = new_endoff; r[1].br_blockcount = old.br_startoff + old.br_blockcount - new_endoff; r[1].br_startblock = new->br_startblock + new->br_blockcount; r[1].br_state = PREV.br_state; xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &r[1], state); xfs_iext_insert(ip, icur, &r[0], state); ifp->if_nextents += 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } /* new right extent - oldext */ error = xfs_bmbt_update(cur, &r[1]); if (error) goto done; /* new left extent - oldext */ cur->bc_rec.b = PREV; if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } /* * Reset the cursor to the position of the new extent * we are about to insert as we can't trust it after * the previous insert. */ error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } /* new middle extent - newext */ if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } } break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: case BMAP_LEFT_CONTIG: case BMAP_RIGHT_CONTIG: /* * These cases are all impossible. */ ASSERT(0); } /* update reverse mappings */ xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, &tmp_logflags, whichfork); *logflagsp |= tmp_logflags; if (error) goto done; } /* clear out the allocated field, done with it now in any case. */ if (cur) { cur->bc_bmap.allocated = 0; *curp = cur; } xfs_bmap_check_leaf_extents(*curp, ip, whichfork); done: *logflagsp |= rval; return error; #undef LEFT #undef RIGHT #undef PREV } /* * Convert a hole to a delayed allocation. 
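 * (A delayed allocation has no disk space assigned yet: br_startblock
 * carries nullstartblock(indlen), where indlen is the worst-case number
 * of indirect blocks reserved for the eventual mapping, read back below
 * with startblockval().)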
*/ STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ int whichfork, struct xfs_iext_cursor *icur, xfs_bmbt_irec_t *new) /* new data to add to file extents */ { struct xfs_ifork *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_filblks_t newlen=0; /* new indirect size */ xfs_filblks_t oldlen=0; /* old indirect size */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t temp; /* temp for indirect calculations */ ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(isnullstartblock(new->br_startblock)); /* * Check and set flags if this segment has a left neighbor */ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { state |= BMAP_LEFT_VALID; if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } /* * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ if (xfs_iext_get_extent(ifp, icur, &right)) { state |= BMAP_RIGHT_VALID; if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } /* * Set contiguity flags on the left and right neighbors. * Don't let extents get too large, even if the pieces are contiguous. */ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && left.br_startoff + left.br_blockcount == new->br_startoff && left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || (left.br_blockcount + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) state |= BMAP_RIGHT_CONTIG; /* * Switch out based on the contiguity flags. */ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with delayed allocations * on the left and on the right. * Merge all three into a single extent record. */ temp = left.br_blockcount + new->br_blockcount + right.br_blockcount; oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), oldlen); left.br_startblock = nullstartblock(newlen); left.br_blockcount = temp; xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &left); break; case BMAP_LEFT_CONTIG: /* * New allocation is contiguous with a delayed allocation * on the left. * Merge the new allocation with the left neighbor. */ temp = left.br_blockcount + new->br_blockcount; oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), oldlen); left.br_blockcount = temp; left.br_startblock = nullstartblock(newlen); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &left); break; case BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with a delayed allocation * on the right. * Merge the new allocation with the right neighbor. 
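 * As in the merge cases above, the indirect reservation for the merged
 * extent is recomputed with xfs_bmap_worst_indlen() and capped at the
 * sum of the old reservations; any surplus is returned to the free
 * block count at the bottom of this function.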
*/ temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), oldlen); right.br_startoff = new->br_startoff; right.br_startblock = nullstartblock(newlen); right.br_blockcount = temp; xfs_iext_update_extent(ip, state, icur, &right); break; case 0: /* * New allocation is not contiguous with another * delayed allocation. * Insert a new entry. */ oldlen = newlen = 0; xfs_iext_insert(ip, icur, new, state); break; } if (oldlen != newlen) { ASSERT(oldlen > newlen); xfs_add_fdblocks(ip->i_mount, oldlen - newlen); /* * Nothing to do for disk quota accounting here. */ xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); } } /* * Convert a hole to a real allocation. */ STATIC int /* error */ xfs_bmap_add_extent_hole_real( struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp, uint32_t flags) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_cur *cur = *curp; int error; /* error return value */ int i; /* temp state */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!cur || !(cur->bc_flags & XFS_BTREE_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); /* * Check and set flags if this segment has a left neighbor. */ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { state |= BMAP_LEFT_VALID; if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } /* * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ if (xfs_iext_get_extent(ifp, icur, &right)) { state |= BMAP_RIGHT_VALID; if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } /* * We're inserting a real allocation between "left" and "right". * Set the contiguity flags. Don't let extents get too large. */ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && xfs_bmap_same_rtgroup(ip, whichfork, &left, new)) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && new->br_startblock + new->br_blockcount == right.br_startblock && new->br_state == right.br_state && new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN) && xfs_bmap_same_rtgroup(ip, whichfork, new, &right)) state |= BMAP_RIGHT_CONTIG; error = 0; /* * Select which case we're in here, and implement it. */ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with real allocations on the * left and on the right. * Merge all three into a single extent record. 
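 * One record is removed here: the right neighbor's bmbt record is
 * deleted and the left neighbor's record rewritten to span all three,
 * so if_nextents drops by one and the inode core is logged.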
*/ left.br_blockcount += new->br_blockcount + right.br_blockcount; xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &left); ifp->if_nextents--; if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(cur, &right, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_btree_delete(cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &left); if (error) goto done; } break; case BMAP_LEFT_CONTIG: /* * New allocation is contiguous with a real allocation * on the left. * Merge the new allocation with the left neighbor. */ old = left; left.br_blockcount += new->br_blockcount; xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &left); if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &left); if (error) goto done; } break; case BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with a real allocation * on the right. * Merge the new allocation with the right neighbor. */ old = right; right.br_startoff = new->br_startoff; right.br_startblock = new->br_startblock; right.br_blockcount += new->br_blockcount; xfs_iext_update_extent(ip, state, icur, &right); if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; error = xfs_bmbt_lookup_eq(cur, &old, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_bmbt_update(cur, &right); if (error) goto done; } break; case 0: /* * New allocation is not contiguous with another * real allocation. * Insert a new entry. */ xfs_iext_insert(ip, icur, new, state); ifp->if_nextents++; if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; error = xfs_bmbt_lookup_eq(cur, new, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } error = xfs_btree_insert(cur, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } } break; } /* add reverse mapping unless caller opted out */ if (!(flags & XFS_BMAPI_NORMAP)) xfs_rmap_map_extent(tp, ip, whichfork, new); /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, curp, 0, &tmp_logflags, whichfork); *logflagsp |= tmp_logflags; cur = *curp; if (error) goto done; } /* clear out the allocated field, done with it now in any case. */ if (cur) cur->bc_bmap.allocated = 0; xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: *logflagsp |= rval; return error; } /* * Functions used in the extent read, allocate and remove paths */ /* * Adjust the size of the new extent based on i_extsize and rt extsize. 
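 * Illustrative example (not from the source): with an extent size hint
 * of 8 blocks, a request for offset 10, length 20 is aligned to offset
 * 8, length 24 (start rounded down by 2, end rounded up by 2) before
 * the neighbor and XFS_MAX_BMBT_EXTLEN trimming below.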
*/ int xfs_bmap_extsize_align( xfs_mount_t *mp, xfs_bmbt_irec_t *gotp, /* next extent pointer */ xfs_bmbt_irec_t *prevp, /* previous extent pointer */ xfs_extlen_t extsz, /* align to this extent size */ int rt, /* is this a realtime inode? */ int eof, /* is extent at end-of-file? */ int delay, /* creating delalloc extent? */ int convert, /* overwriting unwritten extent? */ xfs_fileoff_t *offp, /* in/out: aligned offset */ xfs_extlen_t *lenp) /* in/out: aligned length */ { xfs_fileoff_t orig_off; /* original offset */ xfs_extlen_t orig_alen; /* original length */ xfs_fileoff_t orig_end; /* original off+len */ xfs_fileoff_t nexto; /* next file offset */ xfs_fileoff_t prevo; /* previous file offset */ xfs_fileoff_t align_off; /* temp for offset */ xfs_extlen_t align_alen; /* temp for length */ xfs_extlen_t temp; /* temp for calculations */ if (convert) return 0; orig_off = align_off = *offp; orig_alen = align_alen = *lenp; orig_end = orig_off + orig_alen; /* * If this request overlaps an existing extent, then don't * attempt to perform any additional alignment. */ if (!delay && !eof && (orig_off >= gotp->br_startoff) && (orig_end <= gotp->br_startoff + gotp->br_blockcount)) { return 0; } /* * If the file offset is unaligned vs. the extent size * we need to align it. This will be possible unless * the file was previously written with a kernel that didn't * perform this alignment, or if a truncate shot us in the * foot. */ div_u64_rem(orig_off, extsz, &temp); if (temp) { align_alen += temp; align_off -= temp; } /* Same adjustment for the end of the requested area. */ temp = (align_alen % extsz); if (temp) align_alen += extsz - temp; /* * For large extent hint sizes, the aligned extent might be larger than * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer * allocation loops handle short allocation just fine, so it is safe to * do this. We only want to do it when we are forced to, though, because * it means more allocation operations are required. */ while (align_alen > XFS_MAX_BMBT_EXTLEN) align_alen -= extsz; ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN); /* * If the previous block overlaps with this proposed allocation * then move the start forward without adjusting the length. */ if (prevp->br_startoff != NULLFILEOFF) { if (prevp->br_startblock == HOLESTARTBLOCK) prevo = prevp->br_startoff; else prevo = prevp->br_startoff + prevp->br_blockcount; } else prevo = 0; if (align_off != orig_off && align_off < prevo) align_off = prevo; /* * If the next block overlaps with this proposed allocation * then move the start back without adjusting the length, * but not before offset 0. * This may of course make the start overlap previous block, * and if we hit the offset 0 limit then the next block * can still overlap too. */ if (!eof && gotp->br_startoff != NULLFILEOFF) { if ((delay && gotp->br_startblock == HOLESTARTBLOCK) || (!delay && gotp->br_startblock == DELAYSTARTBLOCK)) nexto = gotp->br_startoff + gotp->br_blockcount; else nexto = gotp->br_startoff; } else nexto = NULLFILEOFF; if (!eof && align_off + align_alen != orig_end && align_off + align_alen > nexto) align_off = nexto > align_alen ? nexto - align_alen : 0; /* * If we're now overlapping the next or previous extent that * means we can't fit an extsz piece in this hole. Just move * the start forward to the first valid spot and set * the length so we hit the end. 
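 * Concretely, the fallback below clamps align_off to prevo and, if the
 * end still overlaps the next extent, shortens align_alen to
 * nexto - align_off, which is only possible while nexto > prevo.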
*/ if (align_off != orig_off && align_off < prevo) align_off = prevo; if (align_off + align_alen != orig_end && align_off + align_alen > nexto && nexto != NULLFILEOFF) { ASSERT(nexto > prevo); align_alen = nexto - align_off; } /* * If realtime, and the result isn't a multiple of the realtime * extent size we need to remove blocks until it is. */ if (rt && (temp = xfs_extlen_to_rtxmod(mp, align_alen))) { /* * We're not covering the original request, or * we won't be able to once we fix the length. */ if (orig_off < align_off || orig_end > align_off + align_alen || align_alen - temp < orig_alen) return -EINVAL; /* * Try to fix it by moving the start up. */ if (align_off + temp <= orig_off) { align_alen -= temp; align_off += temp; } /* * Try to fix it by moving the end in. */ else if (align_off + align_alen - temp >= orig_end) align_alen -= temp; /* * Set the start to the minimum then trim the length. */ else { align_alen -= orig_off - align_off; align_off = orig_off; align_alen -= xfs_extlen_to_rtxmod(mp, align_alen); } /* * Result doesn't cover the request, fail it. */ if (orig_off < align_off || orig_end > align_off + align_alen) return -EINVAL; } else { ASSERT(orig_off >= align_off); /* see XFS_BMBT_MAX_EXTLEN handling above */ ASSERT(orig_end <= align_off + align_alen || align_alen + extsz > XFS_MAX_BMBT_EXTLEN); } #ifdef DEBUG if (!eof && gotp->br_startoff != NULLFILEOFF) ASSERT(align_off + align_alen <= gotp->br_startoff); if (prevp->br_startoff != NULLFILEOFF) ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount); #endif *lenp = align_alen; *offp = align_off; return 0; } static inline bool xfs_bmap_adjacent_valid( struct xfs_bmalloca *ap, xfs_fsblock_t x, xfs_fsblock_t y) { struct xfs_mount *mp = ap->ip->i_mount; if (XFS_IS_REALTIME_INODE(ap->ip) && (ap->datatype & XFS_ALLOC_USERDATA)) { if (!xfs_has_rtgroups(mp)) return x < mp->m_sb.sb_rblocks; return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) && xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount && xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents; } return XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks; } #define XFS_ALLOC_GAP_UNITS 4 /* returns true if ap->blkno was modified */ bool xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { xfs_fsblock_t adjust; /* adjustment to block numbers */ /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. */ if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && !isnullstartblock(ap->prev.br_startblock) && xfs_bmap_adjacent_valid(ap, ap->prev.br_startblock + ap->prev.br_blockcount, ap->prev.br_startblock)) { ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ adjust = ap->offset - (ap->prev.br_startoff + ap->prev.br_blockcount); if (adjust && xfs_bmap_adjacent_valid(ap, ap->blkno + adjust, ap->prev.br_startblock)) ap->blkno += adjust; return true; } /* * If not at eof, then compare the two neighbor blocks. * Figure out whether either one gives us a good starting point, * and pick the better one. */ if (!ap->eof) { xfs_fsblock_t gotbno; /* right side block number */ xfs_fsblock_t gotdiff=0; /* right side difference */ xfs_fsblock_t prevbno; /* left side block number */ xfs_fsblock_t prevdiff=0; /* left side difference */ /* * If there's a previous (left) block, select a requested * start block based on it. 
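 * Illustrative example: when allocating 4 blocks with a 10-block gap
 * after the previous extent, 10 <= XFS_ALLOC_GAP_UNITS * 4, so the
 * requested start becomes the previous extent's end plus the gap; a
 * 20-block gap exceeds the threshold and the request stays at the
 * previous extent's end.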
*/ if (ap->prev.br_startoff != NULLFILEOFF && !isnullstartblock(ap->prev.br_startblock) && (prevbno = ap->prev.br_startblock + ap->prev.br_blockcount) && xfs_bmap_adjacent_valid(ap, prevbno, ap->prev.br_startblock)) { /* * Calculate gap to end of previous block. */ adjust = prevdiff = ap->offset - (ap->prev.br_startoff + ap->prev.br_blockcount); /* * Figure the startblock based on the previous block's * end and the gap size. * Heuristic! * If the gap is large relative to the piece we're * allocating, or using it gives us an invalid block * number, then just use the end of the previous block. */ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && xfs_bmap_adjacent_valid(ap, prevbno + prevdiff, ap->prev.br_startblock)) prevbno += adjust; else prevdiff += adjust; } /* * No previous block or can't follow it, just default. */ else prevbno = NULLFSBLOCK; /* * If there's a following (right) block, select a requested * start block based on it. */ if (!isnullstartblock(ap->got.br_startblock)) { /* * Calculate gap to start of next block. */ adjust = gotdiff = ap->got.br_startoff - ap->offset; /* * Figure the startblock based on the next block's * start and the gap size. */ gotbno = ap->got.br_startblock; /* * Heuristic! * If the gap is large relative to the piece we're * allocating, or using it gives us an invalid block * number, then just use the start of the next block * offset by our length. */ if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && xfs_bmap_adjacent_valid(ap, gotbno - gotdiff, gotbno)) gotbno -= adjust; else if (xfs_bmap_adjacent_valid(ap, gotbno - ap->length, gotbno)) { gotbno -= ap->length; gotdiff += adjust - ap->length; } else gotdiff += adjust; } /* * No next block, just default. */ else gotbno = NULLFSBLOCK; /* * If both valid, pick the better one, else the only good * one, else ap->blkno is already set (to 0 or the inode block). */ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) { ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; return true; } if (prevbno != NULLFSBLOCK) { ap->blkno = prevbno; return true; } if (gotbno != NULLFSBLOCK) { ap->blkno = gotbno; return true; } } return false; } int xfs_bmap_longest_free_extent( struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t *blen) { xfs_extlen_t longest; int error = 0; if (!xfs_perag_initialised_agf(pag)) { error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK, NULL); if (error) return error; } longest = xfs_alloc_longest_free_extent(pag, xfs_alloc_min_freelist(pag_mount(pag), pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; return 0; } static xfs_extlen_t xfs_bmap_select_minlen( struct xfs_bmalloca *ap, struct xfs_alloc_arg *args, xfs_extlen_t blen) { /* * Since we used XFS_ALLOC_FLAG_TRYLOCK in _longest_free_extent(), it is * possible that there is enough contiguous free space for this request. */ if (blen < ap->minlen) return ap->minlen; /* * If the best seen length is less than the request length, * use the best as the minimum, otherwise we've got the maxlen we * were asked for. 
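 * Illustrative example: with ap->minlen of 1, args->maxlen of 64 and a
 * best-seen free extent of 16 blocks, minlen becomes 16, so the
 * allocator does not insist on the full 64-block maxlen it was asked
 * for.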
*/ if (blen < args->maxlen) return blen; return args->maxlen; } static int xfs_bmap_btalloc_select_lengths( struct xfs_bmalloca *ap, struct xfs_alloc_arg *args, xfs_extlen_t *blen) { struct xfs_mount *mp = args->mp; struct xfs_perag *pag; xfs_agnumber_t agno, startag; int error = 0; if (ap->tp->t_flags & XFS_TRANS_LOWMODE) { args->total = ap->minlen; args->minlen = ap->minlen; return 0; } args->total = ap->total; startag = XFS_FSB_TO_AGNO(mp, ap->blkno); if (startag == NULLAGNUMBER) startag = 0; *blen = 0; for_each_perag_wrap(mp, startag, agno, pag) { error = xfs_bmap_longest_free_extent(pag, args->tp, blen); if (error && error != -EAGAIN) break; error = 0; if (*blen >= args->maxlen) break; } if (pag) xfs_perag_rele(pag); args->minlen = xfs_bmap_select_minlen(ap, args, *blen); return error; } /* Update all inode and quota accounting for the allocation we just did. */ void xfs_bmap_alloc_account( struct xfs_bmalloca *ap) { bool isrt = XFS_IS_REALTIME_INODE(ap->ip) && !(ap->flags & XFS_BMAPI_ATTRFORK); uint fld; if (ap->flags & XFS_BMAPI_COWFORK) { /* * COW fork blocks are in-core only and thus are treated as * in-core quota reservation (like delalloc blocks) even when * converted to real blocks. The quota reservation is not * accounted to disk until blocks are remapped to the data * fork. So if these blocks were previously delalloc, we * already have quota reservation and there's nothing to do * yet. */ if (ap->wasdel) { xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0); return; } /* * Otherwise, we've allocated blocks in a hole. The transaction * has acquired in-core quota reservation for this extent. * Rather than account these as real blocks, however, we reduce * the transaction quota reservation based on the allocation. * This essentially transfers the transaction quota reservation * to that of a delalloc extent. */ ap->ip->i_delayed_blks += ap->length; xfs_trans_mod_dquot_byino(ap->tp, ap->ip, isrt ? XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS, -(long)ap->length); return; } /* data/attr fork only */ ap->ip->i_nblocks += ap->length; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { ap->ip->i_delayed_blks -= ap->length; xfs_mod_delalloc(ap->ip, -(int64_t)ap->length, 0); fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT; } else { fld = isrt ? 
XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; } xfs_trans_mod_dquot_byino(ap->tp, ap->ip, fld, ap->length); } static int xfs_bmap_compute_alignments( struct xfs_bmalloca *ap, struct xfs_alloc_arg *args) { struct xfs_mount *mp = args->mp; xfs_extlen_t align = 0; /* minimum allocation alignment */ int stripe_align = 0; /* stripe alignment for allocation is determined by mount parameters */ if (mp->m_swidth && xfs_has_swalloc(mp)) stripe_align = mp->m_swidth; else if (mp->m_dalign) stripe_align = mp->m_dalign; if (ap->flags & XFS_BMAPI_COWFORK) align = xfs_get_cowextsz_hint(ap->ip); else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); if (align) { if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, &ap->length)) ASSERT(0); ASSERT(ap->length); } /* apply extent size hints if obtained earlier */ if (align) { args->prod = align; div_u64_rem(ap->offset, args->prod, &args->mod); if (args->mod) args->mod = args->prod - args->mod; } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) { args->prod = 1; args->mod = 0; } else { args->prod = PAGE_SIZE >> mp->m_sb.sb_blocklog; div_u64_rem(ap->offset, args->prod, &args->mod); if (args->mod) args->mod = args->prod - args->mod; } return stripe_align; } static void xfs_bmap_process_allocated_extent( struct xfs_bmalloca *ap, struct xfs_alloc_arg *args, xfs_fileoff_t orig_offset, xfs_extlen_t orig_length) { ap->blkno = args->fsbno; ap->length = args->len; /* * If the extent size hint is active, we tried to round the * caller's allocation request offset down to extsz and the * length up to another extsz boundary. If we found a free * extent we mapped it in starting at this new offset. If the * newly mapped space isn't long enough to cover any of the * range of offsets that was originally requested, move the * mapping up so that we can fill as much of the caller's * original request as possible. Free space is apparently * very fragmented so we're unlikely to be able to satisfy the * hints anyway. */ if (ap->length <= orig_length) ap->offset = orig_offset; else if (ap->offset + ap->length < orig_offset + orig_length) ap->offset = orig_offset + orig_length - ap->length; xfs_bmap_alloc_account(ap); } static int xfs_bmap_exact_minlen_extent_alloc( struct xfs_bmalloca *ap, struct xfs_alloc_arg *args) { if (ap->minlen != 1) { args->fsbno = NULLFSBLOCK; return 0; } args->alloc_minlen_only = 1; args->minlen = args->maxlen = ap->minlen; args->total = ap->total; /* * Unlike the longest extent available in an AG, we don't track * the length of an AG's shortest extent. * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and * hence we can afford to start traversing from the 0th AG since * we need not be concerned about a drop in performance in * "debug only" code paths. */ ap->blkno = XFS_AGB_TO_FSB(ap->ip->i_mount, 0, 0); /* * Call xfs_bmap_btalloc_low_space here as it first does a "normal" AG * iteration and then drops args->total to args->minlen, which might be * required to find an allocation for the transaction reservation when * the file system is very full. */ return xfs_bmap_btalloc_low_space(ap, args); } /* * If we are not low on available data blocks and we are allocating at * EOF, optimise allocation for contiguous file extension and/or stripe * alignment of the new extent. * * NOTE: ap->aeof is only set if the allocation length is >= the * stripe unit and the allocation offset is at the end of file. 
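 * (Illustrative: with a 16-block stripe unit, an allocation of 16 or
 * more blocks starting exactly at EOF sets ap->aeof; a 4-block append
 * does not, and skips the EOF-specific paths below.)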
 */
static int
xfs_bmap_btalloc_at_eof(
	struct xfs_bmalloca	*ap,
	struct xfs_alloc_arg	*args,
	xfs_extlen_t		blen,
	int			stripe_align,
	bool			ag_only)
{
	struct xfs_mount	*mp = args->mp;
	struct xfs_perag	*caller_pag = args->pag;
	int			error;

	/*
	 * If there are already extents in the file, and xfs_bmap_adjacent()
	 * has given a better blkno, try an exact EOF block allocation to
	 * extend the file as a contiguous extent. If that fails, or it's the
	 * first allocation in a file, just try for a stripe aligned
	 * allocation.
	 */
	if (ap->eof) {
		xfs_extlen_t	nextminlen = 0;

		/*
		 * Compute the minlen+alignment for the next case. Set slop so
		 * that the value of minlen+alignment+slop doesn't go up
		 * between the calls.
		 */
		args->alignment = 1;
		if (blen > stripe_align && blen <= args->maxlen)
			nextminlen = blen - stripe_align;
		else
			nextminlen = args->minlen;
		if (nextminlen + stripe_align > args->minlen + 1)
			args->minalignslop = nextminlen + stripe_align -
					args->minlen - 1;
		else
			args->minalignslop = 0;

		if (!caller_pag)
			args->pag = xfs_perag_get(mp,
					XFS_FSB_TO_AGNO(mp, ap->blkno));
		error = xfs_alloc_vextent_exact_bno(args, ap->blkno);
		if (!caller_pag) {
			xfs_perag_put(args->pag);
			args->pag = NULL;
		}
		if (error)
			return error;

		if (args->fsbno != NULLFSBLOCK)
			return 0;
		/*
		 * Exact allocation failed. Reset to try an aligned allocation
		 * according to the original allocation specification.
		 */
		args->alignment = stripe_align;
		args->minlen = nextminlen;
		args->minalignslop = 0;
	} else {
		/*
		 * Adjust minlen to try to preserve alignment if we
		 * can't guarantee an aligned maxlen extent.
		 */
		args->alignment = stripe_align;
		if (blen > args->alignment &&
		    blen <= args->maxlen + args->alignment)
			args->minlen = blen - args->alignment;
		args->minalignslop = 0;
	}

	if (ag_only) {
		error = xfs_alloc_vextent_near_bno(args, ap->blkno);
	} else {
		args->pag = NULL;
		error = xfs_alloc_vextent_start_ag(args, ap->blkno);
		ASSERT(args->pag == NULL);
		args->pag = caller_pag;
	}
	if (error)
		return error;

	if (args->fsbno != NULLFSBLOCK)
		return 0;

	/*
	 * Allocation failed, so return the allocation args to their original
	 * non-aligned state so the caller can proceed on allocation failure
	 * as if this function was never called.
	 */
	args->alignment = 1;
	return 0;
}

/*
 * We have failed multiple allocation attempts so we are now in a low space
 * allocation situation. Try a locality-first, full filesystem, minimum length
 * allocation whilst still maintaining the necessary total block reservation
 * requirements.
 *
 * If that fails, we are now critically low on space, so perform a last resort
 * allocation attempt: no reserve, no locality, blocking, minimum length, full
 * filesystem free space scan. We also indicate to future allocations in this
 * transaction that we are critically low on space so they don't waste time on
 * allocation modes that are unlikely to succeed.
 */
int
xfs_bmap_btalloc_low_space(
	struct xfs_bmalloca	*ap,
	struct xfs_alloc_arg	*args)
{
	int			error;

	if (args->minlen > ap->minlen) {
		args->minlen = ap->minlen;
		error = xfs_alloc_vextent_start_ag(args, ap->blkno);
		if (error || args->fsbno != NULLFSBLOCK)
			return error;
	}

	/* Last-ditch attempt before failure is declared.
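	 * args->total is dropped to ap->minlen and the scan starts from the
	 * first AG with no locality target; XFS_TRANS_LOWMODE is then set so
	 * later allocations in this transaction skip the optimistic
	 * strategies.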
	 */
	args->total = ap->minlen;
	error = xfs_alloc_vextent_first_ag(args, 0);
	if (error)
		return error;
	ap->tp->t_flags |= XFS_TRANS_LOWMODE;
	return 0;
}

static int
xfs_bmap_btalloc_filestreams(
	struct xfs_bmalloca	*ap,
	struct xfs_alloc_arg	*args,
	int			stripe_align)
{
	xfs_extlen_t		blen = 0;
	int			error = 0;

	error = xfs_filestream_select_ag(ap, args, &blen);
	if (error)
		return error;
	ASSERT(args->pag);

	/*
	 * If we are in low space mode, then optimal allocation will fail so
	 * prepare for minimal allocation and jump to the low space algorithm
	 * immediately.
	 */
	if (ap->tp->t_flags & XFS_TRANS_LOWMODE) {
		args->minlen = ap->minlen;
		ASSERT(args->fsbno == NULLFSBLOCK);
		goto out_low_space;
	}

	args->minlen = xfs_bmap_select_minlen(ap, args, blen);
	if (ap->aeof)
		error = xfs_bmap_btalloc_at_eof(ap, args, blen, stripe_align,
				true);

	if (!error && args->fsbno == NULLFSBLOCK)
		error = xfs_alloc_vextent_near_bno(args, ap->blkno);

out_low_space:
	/*
	 * We are now done with the perag reference for the filestreams
	 * association provided by xfs_filestream_select_ag(). Release it now
	 * as we've either succeeded, had a fatal error or we are out of space
	 * and need to do a full filesystem scan for free space, which will
	 * take its own references.
	 */
	xfs_perag_rele(args->pag);
	args->pag = NULL;
	if (error || args->fsbno != NULLFSBLOCK)
		return error;

	return xfs_bmap_btalloc_low_space(ap, args);
}

static int
xfs_bmap_btalloc_best_length(
	struct xfs_bmalloca	*ap,
	struct xfs_alloc_arg	*args,
	int			stripe_align)
{
	xfs_extlen_t		blen = 0;
	int			error;

	ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino);
	if (!xfs_bmap_adjacent(ap))
		ap->eof = false;

	/*
	 * Search for an allocation group with a single extent large enough
	 * for the request. If one isn't found, then adjust the minimum
	 * allocation size to the largest space found.
	 */
	error = xfs_bmap_btalloc_select_lengths(ap, args, &blen);
	if (error)
		return error;

	/*
	 * Don't attempt optimal EOF allocation if previous allocations barely
	 * succeeded due to being near ENOSPC. It is highly unlikely we'll get
	 * optimal or even aligned allocations in this case, so don't waste
	 * time trying.
	 */
	if (ap->aeof && !(ap->tp->t_flags & XFS_TRANS_LOWMODE)) {
		error = xfs_bmap_btalloc_at_eof(ap, args, blen, stripe_align,
				false);
		if (error || args->fsbno != NULLFSBLOCK)
			return error;
	}

	error = xfs_alloc_vextent_start_ag(args, ap->blkno);
	if (error || args->fsbno != NULLFSBLOCK)
		return error;

	return xfs_bmap_btalloc_low_space(ap, args);
}

static int
xfs_bmap_btalloc(
	struct xfs_bmalloca	*ap)
{
	struct xfs_mount	*mp = ap->ip->i_mount;
	struct xfs_alloc_arg	args = {
		.tp		= ap->tp,
		.mp		= mp,
		.fsbno		= NULLFSBLOCK,
		.oinfo		= XFS_RMAP_OINFO_SKIP_UPDATE,
		.minleft	= ap->minleft,
		.wasdel		= ap->wasdel,
		.resv		= XFS_AG_RESV_NONE,
		.datatype	= ap->datatype,
		.alignment	= 1,
		.minalignslop	= 0,
	};
	xfs_fileoff_t		orig_offset;
	xfs_extlen_t		orig_length;
	int			error;
	int			stripe_align;

	ASSERT(ap->length);
	orig_offset = ap->offset;
	orig_length = ap->length;

	stripe_align = xfs_bmap_compute_alignments(ap, &args);

	/* Trim the allocation back to the maximum an AG can fit.
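	 * (A single extent cannot cross an AG boundary, so a request larger
	 * than m_ag_max_usable has to be satisfied by multiple allocations
	 * from the caller's mapping loop.)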
	 */
	args.maxlen = min(ap->length, mp->m_ag_max_usable);

	if (unlikely(XFS_TEST_ERROR(false, mp,
			XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
		error = xfs_bmap_exact_minlen_extent_alloc(ap, &args);
	else if ((ap->datatype & XFS_ALLOC_USERDATA) &&
			xfs_inode_is_filestream(ap->ip))
		error = xfs_bmap_btalloc_filestreams(ap, &args, stripe_align);
	else
		error = xfs_bmap_btalloc_best_length(ap, &args, stripe_align);
	if (error)
		return error;

	if (args.fsbno != NULLFSBLOCK) {
		xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
			orig_length);
	} else {
		ap->blkno = NULLFSBLOCK;
		ap->length = 0;
	}
	return 0;
}

/* Trim extent to fit a logical block range. */
void
xfs_trim_extent(
	struct xfs_bmbt_irec	*irec,
	xfs_fileoff_t		bno,
	xfs_filblks_t		len)
{
	xfs_fileoff_t		distance;
	xfs_fileoff_t		end = bno + len;

	if (irec->br_startoff + irec->br_blockcount <= bno ||
	    irec->br_startoff >= end) {
		irec->br_blockcount = 0;
		return;
	}

	if (irec->br_startoff < bno) {
		distance = bno - irec->br_startoff;
		if (isnullstartblock(irec->br_startblock))
			irec->br_startblock = DELAYSTARTBLOCK;
		if (irec->br_startblock != DELAYSTARTBLOCK &&
		    irec->br_startblock != HOLESTARTBLOCK)
			irec->br_startblock += distance;
		irec->br_startoff += distance;
		irec->br_blockcount -= distance;
	}

	if (end < irec->br_startoff + irec->br_blockcount) {
		distance = irec->br_startoff + irec->br_blockcount - end;
		irec->br_blockcount -= distance;
	}
}

/*
 * Trim the returned map to the required bounds.
 */
STATIC void
xfs_bmapi_trim_map(
	struct xfs_bmbt_irec	*mval,
	struct xfs_bmbt_irec	*got,
	xfs_fileoff_t		*bno,
	xfs_filblks_t		len,
	xfs_fileoff_t		obno,
	xfs_fileoff_t		end,
	int			n,
	uint32_t		flags)
{
	if ((flags & XFS_BMAPI_ENTIRE) ||
	    got->br_startoff + got->br_blockcount <= obno) {
		*mval = *got;
		if (isnullstartblock(got->br_startblock))
			mval->br_startblock = DELAYSTARTBLOCK;
		return;
	}

	if (obno > *bno)
		*bno = obno;
	ASSERT((*bno >= obno) || (n == 0));
	ASSERT(*bno < end);
	mval->br_startoff = *bno;
	if (isnullstartblock(got->br_startblock))
		mval->br_startblock = DELAYSTARTBLOCK;
	else
		mval->br_startblock = got->br_startblock +
					(*bno - got->br_startoff);
	/*
	 * Return the minimum of what we got and what we asked for as the
	 * length. We can use the len variable here because it is modified
	 * below and we could have been there before coming here if the first
	 * part of the allocation didn't overlap what was asked for.
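	 * Illustrative example: if got covers [5,25) and the caller asked
	 * for [10,30), the mapping returned below starts at offset 10 with
	 * length min(30, 25) - 10 = 15 blocks.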
*/ mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno, got->br_blockcount - (*bno - got->br_startoff)); mval->br_state = got->br_state; ASSERT(mval->br_blockcount <= len); return; } /* * Update and validate the extent map to return */ STATIC void xfs_bmapi_update_map( struct xfs_bmbt_irec **map, xfs_fileoff_t *bno, xfs_filblks_t *len, xfs_fileoff_t obno, xfs_fileoff_t end, int *n, uint32_t flags) { xfs_bmbt_irec_t *mval = *map; ASSERT((flags & XFS_BMAPI_ENTIRE) || ((mval->br_startoff + mval->br_blockcount) <= end)); ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) || (mval->br_startoff < obno)); *bno = mval->br_startoff + mval->br_blockcount; *len = end - *bno; if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) { /* update previous map with new information */ ASSERT(mval->br_startblock == mval[-1].br_startblock); ASSERT(mval->br_blockcount > mval[-1].br_blockcount); ASSERT(mval->br_state == mval[-1].br_state); mval[-1].br_blockcount = mval->br_blockcount; mval[-1].br_state = mval->br_state; } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK && mval[-1].br_startblock != DELAYSTARTBLOCK && mval[-1].br_startblock != HOLESTARTBLOCK && mval->br_startblock == mval[-1].br_startblock + mval[-1].br_blockcount && mval[-1].br_state == mval->br_state) { ASSERT(mval->br_startoff == mval[-1].br_startoff + mval[-1].br_blockcount); mval[-1].br_blockcount += mval->br_blockcount; } else if (*n > 0 && mval->br_startblock == DELAYSTARTBLOCK && mval[-1].br_startblock == DELAYSTARTBLOCK && mval->br_startoff == mval[-1].br_startoff + mval[-1].br_blockcount) { mval[-1].br_blockcount += mval->br_blockcount; mval[-1].br_state = mval->br_state; } else if (!((*n == 0) && ((mval->br_startoff + mval->br_blockcount) <= obno))) { mval++; (*n)++; } *map = mval; } /* * Map file blocks to filesystem blocks without allocation. */ int xfs_bmapi_read( struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, uint32_t flags) { struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; xfs_fileoff_t obno; xfs_fileoff_t end; struct xfs_iext_cursor icur; int error; bool eof = false; int n = 0; ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE))); xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (WARN_ON_ONCE(!ifp)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; XFS_STATS_INC(mp, xs_blk_mapr); error = xfs_iread_extents(NULL, ip, whichfork); if (error) return error; if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) eof = true; end = bno + len; obno = bno; while (bno < end && n < *nmap) { /* Reading past eof, act as though there's a hole up to end. */ if (eof) got.br_startoff = end; if (got.br_startoff > bno) { /* Reading in a hole. */ mval->br_startoff = bno; mval->br_startblock = HOLESTARTBLOCK; mval->br_blockcount = XFS_FILBLKS_MIN(len, got.br_startoff - bno); mval->br_state = XFS_EXT_NORM; bno += mval->br_blockcount; len -= mval->br_blockcount; mval++; n++; continue; } /* set up the extent map to return. */ xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); /* If we're done, stop now. 
		 */
		if (bno >= end || n >= *nmap)
			break;

		/* Else go on to the next record. */
		if (!xfs_iext_next_extent(ifp, &icur, &got))
			eof = true;
	}
	*nmap = n;
	return 0;
}

/*
 * Add a delayed allocation extent to an inode. Blocks are reserved from the
 * global pool and the extent inserted into the inode in-core extent tree.
 *
 * On entry, got refers to the first extent beyond the offset of the extent to
 * allocate or eof is specified if no such extent exists. On return, got refers
 * to the extent record that was inserted to the inode fork.
 *
 * Note that the allocated extent may have been merged with contiguous extents
 * during insertion into the inode fork. Thus, got does not reflect the current
 * state of the inode fork on return. If necessary, the caller can use icur to
 * look up the updated record in the inode fork.
 */
int
xfs_bmapi_reserve_delalloc(
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fileoff_t		off,
	xfs_filblks_t		len,
	xfs_filblks_t		prealloc,
	struct xfs_bmbt_irec	*got,
	struct xfs_iext_cursor	*icur,
	int			eof)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	xfs_extlen_t		alen;
	xfs_extlen_t		indlen;
	uint64_t		fdblocks;
	int			error;
	xfs_fileoff_t		aoff;
	bool			use_cowextszhint =
					whichfork == XFS_COW_FORK && !prealloc;

retry:
	/*
	 * Cap the alloc length. Keep track of prealloc so we know whether to
	 * tag the inode before we return.
	 */
	aoff = off;
	alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
	if (!eof)
		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
	if (prealloc && alen >= len)
		prealloc = alen - len;

	/*
	 * If we're targeting the COW fork but aren't creating a speculative
	 * posteof preallocation, try to expand the reservation to align with
	 * the COW extent size hint if there's sufficient free space.
	 *
	 * Unlike the data fork, the CoW cancellation functions will free all
	 * the reservations at inactivation, so we don't require that every
	 * delalloc reservation have a dirty pagecache.
	 */
	if (use_cowextszhint) {
		struct xfs_bmbt_irec	prev;
		xfs_extlen_t		extsz = xfs_get_cowextsz_hint(ip);

		if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
			prev.br_startoff = NULLFILEOFF;

		error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
					       1, 0, &aoff, &alen);
		ASSERT(!error);
	}

	/*
	 * Make a transaction-less quota reservation for delayed allocation
	 * blocks. This number gets adjusted later. We return if we haven't
	 * allocated blocks already inside this loop.
	 */
	error = xfs_quota_reserve_blkres(ip, alen);
	if (error)
		goto out;

	/*
	 * Split changing the in-core superblock counters for alen and indlen
	 * since they could be coming from different places (realtime extents
	 * vs. free data blocks).
	 */
	indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
	ASSERT(indlen > 0);

	fdblocks = indlen;
	if (XFS_IS_REALTIME_INODE(ip)) {
		error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
		if (error)
			goto out_unreserve_quota;
	} else {
		fdblocks += alen;
	}

	error = xfs_dec_fdblocks(mp, fdblocks, false);
	if (error)
		goto out_unreserve_frextents;

	ip->i_delayed_blks += alen;
	xfs_mod_delalloc(ip, alen, indlen);

	got->br_startoff = aoff;
	got->br_startblock = nullstartblock(indlen);
	got->br_blockcount = alen;
	got->br_state = XFS_EXT_NORM;

	xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);

	/*
	 * Tag the inode if blocks were preallocated. Note that COW fork
	 * preallocation can occur at the start or end of the extent, even
	 * when prealloc == 0, so we must also check the aligned offset and
	 * length.
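	 * Illustrative example: with a 32-block cowextsize hint, a COW fork
	 * reservation for offset 40 may be aligned down to aoff 32 even with
	 * prealloc == 0; aoff < off then triggers the cowblocks tag below.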
*/ if (whichfork == XFS_DATA_FORK && prealloc) xfs_inode_set_eofblocks_tag(ip); if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) xfs_inode_set_cowblocks_tag(ip); return 0; out_unreserve_frextents: if (XFS_IS_REALTIME_INODE(ip)) xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) xfs_quota_unreserve_blkres(ip, alen); out: if (error == -ENOSPC || error == -EDQUOT) { trace_xfs_delalloc_enospc(ip, off, len); if (prealloc || use_cowextszhint) { /* retry without any preallocation */ use_cowextszhint = false; prealloc = 0; goto retry; } } return error; } static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; int whichfork = xfs_bmapi_whichfork(bma->flags); struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); int error; ASSERT(bma->length > 0); ASSERT(bma->length <= XFS_MAX_BMBT_EXTLEN); if (bma->flags & XFS_BMAPI_CONTIG) bma->minlen = bma->length; else bma->minlen = 1; if (!(bma->flags & XFS_BMAPI_METADATA)) { /* * For the data and COW fork, the first data in the file is * treated differently to all other allocations. For the * attribute fork, we only need to ensure the allocated range * is not on the busy list. */ bma->datatype = XFS_ALLOC_NOBUSY; if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { bma->datatype |= XFS_ALLOC_USERDATA; if (bma->offset == 0) bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; if (mp->m_dalign && bma->length >= mp->m_dalign) { error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; } } } if ((bma->datatype & XFS_ALLOC_USERDATA) && XFS_IS_REALTIME_INODE(bma->ip)) error = xfs_bmap_rtalloc(bma); else error = xfs_bmap_btalloc(bma); if (error) return error; if (bma->blkno == NULLFSBLOCK) return -ENOSPC; if (WARN_ON_ONCE(!xfs_valid_startblock(bma->ip, bma->blkno))) { xfs_bmap_mark_sick(bma->ip, whichfork); return -EFSCORRUPTED; } if (bma->flags & XFS_BMAPI_ZERO) { error = xfs_zero_extent(bma->ip, bma->blkno, bma->length); if (error) return error; } if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur) bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); /* * Bump the number of extents we've allocated * in this call. */ bma->nallocs++; if (bma->cur && bma->wasdel) bma->cur->bc_flags |= XFS_BTREE_BMBT_WASDEL; bma->got.br_startoff = bma->offset; bma->got.br_startblock = bma->blkno; bma->got.br_blockcount = bma->length; bma->got.br_state = XFS_EXT_NORM; if (bma->flags & XFS_BMAPI_PREALLOC) bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) error = xfs_bmap_add_extent_delay_real(bma, whichfork); else error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, whichfork, &bma->icur, &bma->cur, &bma->got, &bma->logflags, bma->flags); if (error) return error; /* * Update our extent pointer, given that xfs_bmap_add_extent_delay_real * or xfs_bmap_add_extent_hole_real might have merged it into one of * the neighbouring ones. 
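 * e.g. (illustrative) an allocation merged with a contiguous left
 * neighbour comes back from xfs_iext_get_extent() as one larger record
 * whose br_startoff sits below bma->offset; the ASSERTs below allow
 * exactly that.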
*/ xfs_iext_get_extent(ifp, &bma->icur, &bma->got); ASSERT(bma->got.br_startoff <= bma->offset); ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= bma->offset + bma->length); ASSERT(bma->got.br_state == XFS_EXT_NORM || bma->got.br_state == XFS_EXT_UNWRITTEN); return 0; } STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, uint32_t flags) { int whichfork = xfs_bmapi_whichfork(flags); struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); int tmp_logflags = 0; int error; /* check if we need to do unwritten->real conversion */ if (mval->br_state == XFS_EXT_UNWRITTEN && (flags & XFS_BMAPI_PREALLOC)) return 0; /* check if we need to do real->unwritten conversion */ if (mval->br_state == XFS_EXT_NORM && (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) != (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) return 0; /* * Modify (by adding) the state flag, if writing. */ ASSERT(mval->br_blockcount <= len); if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur) { bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, bma->ip, whichfork); } mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; /* * Before insertion into the bmbt, zero the range being converted * if required. */ if (flags & XFS_BMAPI_ZERO) { error = xfs_zero_extent(bma->ip, mval->br_startblock, mval->br_blockcount); if (error) return error; } error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork, &bma->icur, &bma->cur, mval, &tmp_logflags); /* * Log the inode core unconditionally in the unwritten extent conversion * path because the conversion might not have done so (e.g., if the * extent count hasn't changed). We need to make sure the inode is dirty * in the transaction for the sake of fsync(), even if nothing has * changed, because fsync() will not force the log for this transaction * unless it sees the inode pinned. * * Note: If we're only converting cow fork extents, there aren't * any on-disk updates to make, so we don't need to log anything. */ if (whichfork != XFS_COW_FORK) bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; /* * Update our extent pointer, given that * xfs_bmap_add_extent_unwritten_real might have merged it into one * of the neighbouring ones. */ xfs_iext_get_extent(ifp, &bma->icur, &bma->got); /* * We may have combined previously unwritten space with written space, * so generate another request. */ if (mval->br_blockcount < len) return -EAGAIN; return 0; } xfs_extlen_t xfs_bmapi_minleft( struct xfs_trans *tp, struct xfs_inode *ip, int fork) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork); if (tp && tp->t_highest_agno != NULLAGNUMBER) return 0; if (ifp->if_format != XFS_DINODE_FMT_BTREE) return 1; return be16_to_cpu(ifp->if_broot->bb_level) + 1; } /* * Log whatever the flags say, even if error. Otherwise we might miss detecting * a case where the data is changed, there's an error, and it's not logged so we * don't shutdown when we should. Don't bother logging extents/btree changes if * we converted to the other format. 
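 * Hence xfs_bmapi_finish() below logs bma->logflags even when error is
 * set, and only drops the extent-list or bmbt-root flags when the fork
 * has since changed to the other format.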
 */
static void
xfs_bmapi_finish(
	struct xfs_bmalloca	*bma,
	int			whichfork,
	int			error)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(bma->ip, whichfork);

	if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
	    ifp->if_format != XFS_DINODE_FMT_EXTENTS)
		bma->logflags &= ~xfs_ilog_fext(whichfork);
	else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
		 ifp->if_format != XFS_DINODE_FMT_BTREE)
		bma->logflags &= ~xfs_ilog_fbroot(whichfork);

	if (bma->logflags)
		xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
	if (bma->cur)
		xfs_btree_del_cursor(bma->cur, error);
}

/*
 * Map file blocks to filesystem blocks, and allocate blocks or convert the
 * extent state if necessary. Detailed behaviour is controlled by the flags
 * parameter. Only allocates blocks from a single allocation group, to avoid
 * locking problems.
 *
 * Returns 0 on success and places the extent mappings in mval. nmaps is used
 * as an input/output parameter where the caller specifies the maximum number
 * of mappings that may be returned and xfs_bmapi_write passes back the number
 * of mappings (including existing mappings) it found.
 *
 * Returns a negative error code on failure, including -ENOSPC when it could
 * not allocate any blocks and -ENOSR when it did allocate blocks to convert a
 * delalloc range, but those blocks were before the passed in range.
 */
int
xfs_bmapi_write(
	struct xfs_trans	*tp,		/* transaction pointer */
	struct xfs_inode	*ip,		/* incore inode */
	xfs_fileoff_t		bno,		/* starting file offs. mapped */
	xfs_filblks_t		len,		/* length to map in file */
	uint32_t		flags,		/* XFS_BMAPI_... */
	xfs_extlen_t		total,		/* total blocks needed */
	struct xfs_bmbt_irec	*mval,		/* output: map values */
	int			*nmap)		/* i/o: mval size/count */
{
	struct xfs_bmalloca	bma = {
		.tp		= tp,
		.ip		= ip,
		.total		= total,
	};
	struct xfs_mount	*mp = ip->i_mount;
	int			whichfork = xfs_bmapi_whichfork(flags);
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	xfs_fileoff_t		end;		/* end of mapped file region */
	bool			eof = false;	/* after the end of extents */
	int			error;		/* error return */
	int			n;		/* current extent index */
	xfs_fileoff_t		obno;		/* old block number (offset) */

#ifdef DEBUG
	xfs_fileoff_t		orig_bno;	/* original block number value */
	int			orig_flags;	/* original flags arg value */
	xfs_filblks_t		orig_len;	/* original value of len arg */
	struct xfs_bmbt_irec	*orig_mval;	/* original value of mval */
	int			orig_nmap;	/* original value of *nmap */

	orig_bno = bno;
	orig_len = len;
	orig_flags = flags;
	orig_mval = mval;
	orig_nmap = *nmap;
#endif

	ASSERT(*nmap >= 1);
	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
	ASSERT(tp != NULL);
	ASSERT(len > 0);
	ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL);
	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
	ASSERT(!(flags & XFS_BMAPI_REMAP));

	/* zeroing is currently only for data extents, not metadata */
	ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
			(XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
	/*
	 * we can allocate unwritten extents or pre-zero allocated blocks,
	 * but it makes no sense to do both at once. This would result in
	 * zeroing the unwritten extent twice, but it still being an
	 * unwritten extent....
*/ ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) != (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; XFS_STATS_INC(mp, xs_blk_mapw); error = xfs_iread_extents(tp, ip, whichfork); if (error) goto error0; if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); n = 0; end = bno + len; obno = bno; while (bno < end && n < *nmap) { bool need_alloc = false, wasdelay = false; /* in hole or beyond EOF? */ if (eof || bma.got.br_startoff > bno) { /* * CoW fork conversions should /never/ hit EOF or * holes. There should always be something for us * to work on. */ ASSERT(!((flags & XFS_BMAPI_CONVERT) && (flags & XFS_BMAPI_COWFORK))); need_alloc = true; } else if (isnullstartblock(bma.got.br_startblock)) { wasdelay = true; } /* * First, deal with the hole before the allocated space * that we found, if any. */ if (need_alloc || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; bma.offset = bno; bma.flags = flags; /* * There's a 32/64 bit type mismatch between the * allocation length request (which can be 64 bits in * length) and the bma length request, which is * xfs_extlen_t and therefore 32 bits. Hence we have to * be careful and do the min() using the larger type to * avoid overflows. */ bma.length = XFS_FILBLKS_MIN(len, XFS_MAX_BMBT_EXTLEN); if (wasdelay) { bma.length = XFS_FILBLKS_MIN(bma.length, bma.got.br_blockcount - (bno - bma.got.br_startoff)); } else { if (!eof) bma.length = XFS_FILBLKS_MIN(bma.length, bma.got.br_startoff - bno); } ASSERT(bma.length > 0); error = xfs_bmapi_allocate(&bma); if (error) { /* * If we already allocated space in a previous * iteration return what we got so far when * running out of space. */ if (error == -ENOSPC && bma.nallocs) break; goto error0; } /* * If this is a CoW allocation, record the data in * the refcount btree for orphan recovery. */ if (whichfork == XFS_COW_FORK) xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip), bma.blkno, bma.length); } /* Deal with the allocated space we found. */ xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno, end, n, flags); /* Execute unwritten extent conversion if necessary */ error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); if (error == -EAGAIN) continue; if (error) goto error0; /* update the extent map to return */ xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); /* * If we're done, stop now. Stop when we've allocated * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise * the transaction may get too big. */ if (bno >= end || n >= *nmap || bma.nallocs >= *nmap) break; /* Else go on to the next record. */ bma.prev = bma.got; if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got)) eof = true; } error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); if (error) goto error0; ASSERT(ifp->if_format != XFS_DINODE_FMT_BTREE || ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork)); xfs_bmapi_finish(&bma, whichfork, 0); xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, orig_nmap, n); /* * When converting delayed allocations, xfs_bmapi_allocate ignores * the passed in bno and always converts from the start of the found * delalloc extent.
* * To avoid a successful return with *nmap set to 0, return the magic * -ENOSR error code for this particular case so that the caller can * handle it. */ if (!n) { ASSERT(bma.nallocs >= *nmap); return -ENOSR; } *nmap = n; return 0; error0: xfs_bmapi_finish(&bma, whichfork, error); return error; } /* * Convert an existing delalloc extent to real blocks based on file offset. This * attempts to allocate the entire delalloc extent and may require multiple * invocations to allocate the target offset if a large enough physical extent * is not available. */ static int xfs_bmapi_convert_one_delalloc( struct xfs_inode *ip, int whichfork, xfs_off_t offset, struct iomap *iomap, unsigned int *seq) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; uint16_t flags = 0; struct xfs_trans *tp; int error; if (whichfork == XFS_COW_FORK) flags |= IOMAP_F_SHARED; /* * Space for the extent and indirect blocks was reserved when the * delalloc extent was created so there's no need to do so here. */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); error = xfs_iext_count_extend(tp, ip, whichfork, XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || bma.got.br_startoff > offset_fsb) { /* * No extent found in the range we are trying to convert. This * should only happen for the COW fork, where another thread * might have moved the extent to the data fork in the meantime. */ WARN_ON_ONCE(whichfork != XFS_COW_FORK); error = -EAGAIN; goto out_trans_cancel; } /* * If we find a real extent here, we raced with another thread converting * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); if (seq) *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } bma.tp = tp; bma.ip = ip; bma.wasdel = true; bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); /* * Always allocate and convert from the start of the delalloc extent, * even if that is outside the passed in range, to create large * contiguous extents on disk. */ bma.offset = bma.got.br_startoff; bma.length = bma.got.br_blockcount; /* * When we're converting the delalloc reservations backing dirty pages * in the page cache, we must be careful about how we create the new * extents: * * New CoW fork extents are created unwritten, turned into real extents * when we're about to write the data to disk, and mapped into the data * fork after the write finishes. End of story. * * New data fork extents must be mapped in as unwritten and converted * to real extents after the write succeeds to avoid exposing stale * disk contents if we crash.
*/ bma.flags = XFS_BMAPI_PREALLOC; if (whichfork == XFS_COW_FORK) bma.flags |= XFS_BMAPI_COWFORK; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; error = xfs_bmapi_allocate(&bma); if (error) goto out_finish; XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); if (seq) *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip), bma.blkno, bma.length); error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); if (error) goto out_finish; xfs_bmapi_finish(&bma, whichfork, 0); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_finish: xfs_bmapi_finish(&bma, whichfork, error); out_trans_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* * Pass in a delalloc extent and convert it to real extents, return the real * extent that maps offset_fsb in iomap. */ int xfs_bmapi_convert_delalloc( struct xfs_inode *ip, int whichfork, loff_t offset, struct iomap *iomap, unsigned int *seq) { int error; /* * Attempt to allocate whatever delalloc extent currently backs offset * and put the result into iomap. Allocate in a loop because it may * take several attempts to allocate real blocks for a contiguous * delalloc extent if free space is sufficiently fragmented. */ do { error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset, iomap, seq); if (error) return error; } while (iomap->offset + iomap->length <= offset); return 0; } int xfs_bmapi_remap( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, uint32_t flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; int whichfork = xfs_bmapi_whichfork(flags); int logflags = 0, error; ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(len > 0); ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_NORMAP))); ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; if (xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) { /* make sure we only reflink into a hole.
*/ ASSERT(got.br_startoff > bno); ASSERT(got.br_startoff - bno >= len); } ip->i_nblocks += len; ip->i_delayed_blks -= len; /* see xfs_bmap_defer_add */ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); got.br_startoff = bno; got.br_startblock = startblock; got.br_blockcount = len; if (flags & XFS_BMAPI_PREALLOC) got.br_state = XFS_EXT_UNWRITTEN; else got.br_state = XFS_EXT_NORM; error = xfs_bmap_add_extent_hole_real(tp, ip, whichfork, &icur, &cur, &got, &logflags, flags); if (error) goto error0; error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: if (ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS) logflags &= ~XFS_ILOG_DEXT; else if (ip->i_df.if_format != XFS_DINODE_FMT_BTREE) logflags &= ~XFS_ILOG_DBROOT; if (logflags) xfs_trans_log_inode(tp, ip, logflags); if (cur) xfs_btree_del_cursor(cur, error); return error; } /* * When a delalloc extent is split (e.g., due to a hole punch), the original * indlen reservation must be shared across the two new extents that are left * behind. * * Given the original reservation and the worst case indlen for the two new * extents (as calculated by xfs_bmap_worst_indlen()), split the original * reservation fairly across the two new extents. If necessary, the caller * steals available blocks from the deleted extent to make up a reservation * deficiency (e.g., if ores == 1) before calling this function; the * availability and subsequent accounting of stolen blocks is the * responsibility of the caller (see xfs_bmap_del_extent_delay()). */ static void xfs_bmap_split_indlen( xfs_filblks_t ores, /* original res. */ xfs_filblks_t *indlen1, /* ext1 worst indlen */ xfs_filblks_t *indlen2) /* ext2 worst indlen */ { xfs_filblks_t len1 = *indlen1; xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ xfs_filblks_t resfactor; /* * We can't meet the total required reservation for the two extents. * Calculate the percent of the overall shortage between both extents * and apply this percentage to each of the requested indlen values. * This distributes the shortage fairly and reduces the chances that one * of the two extents is left with nothing when extents are repeatedly * split. */ resfactor = (ores * 100); do_div(resfactor, nres); len1 *= resfactor; do_div(len1, 100); len2 *= resfactor; do_div(len2, 100); ASSERT(len1 + len2 <= ores); ASSERT(len1 < *indlen1 && len2 < *indlen2); /* * Hand out the remainder to each extent. If one of the two reservations * is zero, we want to make sure that one gets a block first. The loop * below starts with len1, so hand len2 a block right off the bat if it * is zero.
*/ ores -= (len1 + len2); ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores); if (ores && !len2 && *indlen2) { len2++; ores--; } while (ores) { if (len1 < *indlen1) { len1++; ores--; } if (!ores) break; if (len2 < *indlen2) { len2++; ores--; } } *indlen1 = len1; *indlen2 = len2; } void xfs_bmap_del_extent_delay( struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; xfs_filblks_t got_indlen, new_indlen, stolen = 0; uint32_t state = xfs_bmap_fork_to_state(whichfork); uint64_t fdblocks; bool isrt; XFS_STATS_INC(mp, xs_del_exlist); isrt = xfs_ifork_is_realtime(ip, whichfork); del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got->br_startoff + got->br_blockcount; da_old = startblockval(got->br_startblock); da_new = 0; ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); /* * Update the inode delalloc counter now and wait to update the * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ xfs_quota_unreserve_blkres(ip, del->br_blockcount); ip->i_delayed_blks -= del->br_blockcount; if (got->br_startoff == del->br_startoff) state |= BMAP_LEFT_FILLING; if (got_endoff == del_endoff) state |= BMAP_RIGHT_FILLING; switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) { case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Matches the whole extent. Delete the entry. */ xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); break; case BMAP_LEFT_FILLING: /* * Deleting the first part of the extent. */ got->br_startoff = del_endoff; got->br_blockcount -= del->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); xfs_iext_update_extent(ip, state, icur, got); break; case BMAP_RIGHT_FILLING: /* * Deleting the last part of the extent. */ got->br_blockcount = got->br_blockcount - del->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); xfs_iext_update_extent(ip, state, icur, got); break; case 0: /* * Deleting the middle of the extent. * * Distribute the original indlen reservation across the two new * extents. Steal blocks from the deleted extent if necessary. * Stealing blocks simply fudges the fdblocks accounting below. * Warn if either of the new indlen reservations is zero as this * can lead to delalloc problems. */ got->br_blockcount = del->br_startoff - got->br_startoff; got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount); new.br_blockcount = got_endoff - del_endoff; new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount); WARN_ON_ONCE(!got_indlen || !new_indlen); /* * Steal as many blocks as we can to try and satisfy the worst * case indlen for both new extents. * * However, we can't just steal reservations from the data * blocks if this is an RT inodes as the data and metadata * blocks come from different pools. We'll have to live with * under-filled indirect reservation in this case. 
*/ da_new = got_indlen + new_indlen; if (da_new > da_old && !isrt) { stolen = XFS_FILBLKS_MIN(da_new - da_old, del->br_blockcount); da_old += stolen; } if (da_new > da_old) xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen); da_new = got_indlen + new_indlen; got->br_startblock = nullstartblock((int)got_indlen); new.br_startoff = del_endoff; new.br_state = got->br_state; new.br_startblock = nullstartblock((int)new_indlen); xfs_iext_update_extent(ip, state, icur, got); xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); del->br_blockcount -= stolen; break; } ASSERT(da_old >= da_new); da_diff = da_old - da_new; fdblocks = da_diff; if (isrt) xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount)); else fdblocks += del->br_blockcount; xfs_add_fdblocks(mp, fdblocks); xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); } void xfs_bmap_del_extent_cow( struct xfs_inode *ip, struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec new; xfs_fileoff_t del_endoff, got_endoff; uint32_t state = BMAP_COWFORK; XFS_STATS_INC(mp, xs_del_exlist); del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got->br_startoff + got->br_blockcount; ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); ASSERT(!isnullstartblock(got->br_startblock)); if (got->br_startoff == del->br_startoff) state |= BMAP_LEFT_FILLING; if (got_endoff == del_endoff) state |= BMAP_RIGHT_FILLING; switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) { case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Matches the whole extent. Delete the entry. */ xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); break; case BMAP_LEFT_FILLING: /* * Deleting the first part of the extent. */ got->br_startoff = del_endoff; got->br_blockcount -= del->br_blockcount; got->br_startblock = del->br_startblock + del->br_blockcount; xfs_iext_update_extent(ip, state, icur, got); break; case BMAP_RIGHT_FILLING: /* * Deleting the last part of the extent. */ got->br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, got); break; case 0: /* * Deleting the middle of the extent. */ got->br_blockcount = del->br_startoff - got->br_startoff; new.br_startoff = del_endoff; new.br_blockcount = got_endoff - del_endoff; new.br_state = got->br_state; new.br_startblock = del->br_startblock + del->br_blockcount; xfs_iext_update_extent(ip, state, icur, got); xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); break; } ip->i_delayed_blks -= del->br_blockcount; } static int xfs_bmap_free_rtblocks( struct xfs_trans *tp, struct xfs_bmbt_irec *del) { struct xfs_rtgroup *rtg; int error; rtg = xfs_rtgroup_grab(tp->t_mountp, 0); if (!rtg) return -EIO; /* * Ensure the bitmap and summary inodes are locked and joined to the * transaction before modifying them. */ if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) { tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED; xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP); xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_BITMAP); } error = xfs_rtfree_blocks(tp, rtg, del->br_startblock, del->br_blockcount); xfs_rtgroup_rele(rtg); return error; } /* * Called by xfs_bmapi to update file extent records and the btree * after removing space. 
*/ STATIC int /* error */ xfs_bmap_del_extent_real( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ struct xfs_iext_cursor *icur, struct xfs_btree_cur *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ int whichfork, /* data or attr fork */ uint32_t bflags) /* bmapi flags */ { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ int error = 0; /* error return value */ struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ struct xfs_ifork *ifp; /* inode fork pointer */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t nblks; /* quota/sb block count */ xfs_bmbt_irec_t new; /* new record to be inserted */ /* REFERENCED */ uint qfield; /* quota field to update */ uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; *logflagsp = 0; mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(del->br_blockcount > 0); xfs_iext_get_extent(ifp, icur, &got); ASSERT(got.br_startoff <= del->br_startoff); del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got.br_startoff + got.br_blockcount; ASSERT(got_endoff >= del_endoff); ASSERT(!isnullstartblock(got.br_startblock)); qfield = 0; /* * If it's the case where the directory code is running with no block * reservation, and the deleted block is in the middle of its extent, * and the resulting insert of an extent would cause transformation to * btree format, then reject it. The calling code will then swap blocks * around instead. We have to do this now, rather than waiting for the * conversion to btree format, since the transaction will be dirty then. */ if (tp->t_blk_res == 0 && ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents >= XFS_IFORK_MAXEXT(ip, whichfork) && del->br_startoff > got.br_startoff && del_endoff < got_endoff) return -ENOSPC; *logflagsp = XFS_ILOG_CORE; if (xfs_ifork_is_realtime(ip, whichfork)) qfield = XFS_TRANS_DQ_RTBCOUNT; else qfield = XFS_TRANS_DQ_BCOUNT; nblks = del->br_blockcount; del_endblock = del->br_startblock + del->br_blockcount; if (cur) { error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) return error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } } if (got.br_startoff == del->br_startoff) state |= BMAP_LEFT_FILLING; if (got_endoff == del_endoff) state |= BMAP_RIGHT_FILLING; switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) { case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Matches the whole extent. Delete the entry. */ xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); ifp->if_nextents--; *logflagsp |= XFS_ILOG_CORE; if (!cur) { *logflagsp |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_btree_delete(cur, &i))) return error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } break; case BMAP_LEFT_FILLING: /* * Deleting the first part of the extent. */ got.br_startoff = del_endoff; got.br_startblock = del_endblock; got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) return error; break; case BMAP_RIGHT_FILLING: /* * Deleting the last part of the extent. 
*/ got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) return error; break; case 0: /* * Deleting the middle of the extent. */ old = got; got.br_blockcount = del->br_startoff - got.br_startoff; xfs_iext_update_extent(ip, state, icur, &got); new.br_startoff = del_endoff; new.br_blockcount = got_endoff - del_endoff; new.br_state = got.br_state; new.br_startblock = del_endblock; *logflagsp |= XFS_ILOG_CORE; if (cur) { error = xfs_bmbt_update(cur, &got); if (error) return error; error = xfs_btree_increment(cur, 0, &i); if (error) return error; cur->bc_rec.b = new; error = xfs_btree_insert(cur, &i); if (error && error != -ENOSPC) return error; /* * If we get -ENOSPC back from the btree insert, it tried a * split, and we have a zero block reservation. Fix up * our state and return the error. */ if (error == -ENOSPC) { /* * Reset the cursor, don't trust it after any * insert operation. */ error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) return error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } /* * Update the btree record back * to the original value. */ error = xfs_bmbt_update(cur, &old); if (error) return error; /* * Reset the extent record back * to the original value. */ xfs_iext_update_extent(ip, state, icur, &old); *logflagsp = 0; return -ENOSPC; } if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } } else *logflagsp |= xfs_ilog_fext(whichfork); ifp->if_nextents++; xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); break; } /* remove reverse mapping */ xfs_rmap_unmap_extent(tp, ip, whichfork, del); /* * If we need to, add to list of extents to delete. */ if (!(bflags & XFS_BMAPI_REMAP)) { bool isrt = xfs_ifork_is_realtime(ip, whichfork); if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, isrt, del); } else if (isrt && !xfs_has_rtgroups(mp)) { error = xfs_bmap_free_rtblocks(tp, del); } else { unsigned int efi_flags = 0; if ((bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN) efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD; /* * Historically, we did not use EFIs to free realtime * extents. However, when reverse mapping is enabled, * we must maintain the same order of operations as the * data device, which is: Remove the file mapping, * remove the reverse mapping, and then free the * blocks. Reflink for realtime volumes requires the * same sort of ordering. Both features rely on * rtgroups, so let's gate rt EFI usage on rtgroups. */ if (isrt) efi_flags |= XFS_FREE_EXTENT_REALTIME; error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, efi_flags); } if (error) return error; } /* * Adjust inode # blocks in the file. */ if (nblks) ip->i_nblocks -= nblks; /* * Adjust quota data. */ if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); return 0; } /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to * that value. If not all extents in the block range can be removed then * *done is set.
*/ static int __xfs_bunmapi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t start, /* first file offset deleted */ xfs_filblks_t *rlen, /* i/o: amount remaining */ uint32_t flags, /* misc flags */ xfs_extnum_t nexts) /* number of extents max */ { struct xfs_btree_cur *cur; /* bmap btree cursor */ struct xfs_bmbt_irec del; /* extent being deleted */ int error; /* error return value */ xfs_extnum_t extno; /* extent number in list */ struct xfs_bmbt_irec got; /* current extent record */ struct xfs_ifork *ifp; /* inode fork pointer */ int isrt; /* freeing in rt area */ int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ struct xfs_mount *mp = ip->i_mount; int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t end; struct xfs_iext_cursor icur; bool done = false; trace_xfs_bunmap(ip, start, len, flags, _RET_IP_); whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); ifp = xfs_ifork_ptr(ip, whichfork); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(len > 0); ASSERT(nexts >= 0); error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; if (xfs_iext_count(ifp) == 0) { *rlen = 0; return 0; } XFS_STATS_INC(mp, xs_blk_unmap); isrt = xfs_ifork_is_realtime(ip, whichfork); end = start + len; if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) { *rlen = 0; return 0; } end--; logflags = 0; if (ifp->if_format == XFS_DINODE_FMT_BTREE) { ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); } else cur = NULL; extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && (nexts == 0 || extno < nexts)) { /* * Is the found extent after a hole in which end lives? * Just back up to the previous extent, if so. */ if (got.br_startoff > end && !xfs_iext_prev_extent(ifp, &icur, &got)) { done = true; break; } /* * Is the last block of this extent before the range * we're supposed to delete? If so, we're done. */ end = XFS_FILEOFF_MIN(end, got.br_startoff + got.br_blockcount - 1); if (end < start) break; /* * Then deal with the (possibly delayed) allocated space * we found. */ del = got; wasdel = isnullstartblock(del.br_startblock); if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; if (!wasdel) del.br_startblock += start - got.br_startoff; } if (del.br_startoff + del.br_blockcount > end + 1) del.br_blockcount = end + 1 - del.br_startoff; if (!isrt || (flags & XFS_BMAPI_REMAP)) goto delete; mod = xfs_rtb_to_rtxoff(mp, del.br_startblock + del.br_blockcount); if (mod) { /* * Realtime extent not lined up at the end. * The extent could have been split into written * and unwritten pieces, or we could just be * unmapping part of it. But we can't really * get rid of part of a realtime extent. */ if (del.br_state == XFS_EXT_UNWRITTEN) { /* * This piece is unwritten, or we're not * using unwritten extents. Skip over it. */ ASSERT((flags & XFS_BMAPI_REMAP) || end >= mod); end -= mod > del.br_blockcount ? del.br_blockcount : mod; if (end < got.br_startoff && !xfs_iext_prev_extent(ifp, &icur, &got)) { done = true; break; } continue; } /* * It's written, turn it unwritten. * This is better than zeroing it. 
*/ ASSERT(del.br_state == XFS_EXT_NORM); ASSERT(tp->t_blk_res > 0); /* * If this spans a realtime extent boundary, * chop it back to the start of the one we end at. */ if (del.br_blockcount > mod) { del.br_startoff += del.br_blockcount - mod; del.br_startblock += del.br_blockcount - mod; del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, whichfork, &icur, &cur, &del, &logflags); if (error) goto error0; goto nodelete; } mod = xfs_rtb_to_rtxoff(mp, del.br_startblock); if (mod) { xfs_extlen_t off = mp->m_sb.sb_rextsize - mod; /* * Realtime extent is lined up at the end but not * at the front. We'll get rid of full extents if * we can. */ if (del.br_blockcount > off) { del.br_blockcount -= off; del.br_startoff += off; del.br_startblock += off; } else if (del.br_startoff == start && (del.br_state == XFS_EXT_UNWRITTEN || tp->t_blk_res == 0)) { /* * Can't make it unwritten. There isn't * a full extent here so just skip it. */ ASSERT(end >= del.br_blockcount); end -= del.br_blockcount; if (got.br_startoff > end && !xfs_iext_prev_extent(ifp, &icur, &got)) { done = true; break; } continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { struct xfs_bmbt_irec prev; xfs_fileoff_t unwrite_start; /* * This one is already unwritten. * It must have a written left neighbor. * Unwrite the killed part of that one and * try again. */ if (!xfs_iext_prev_extent(ifp, &icur, &prev)) ASSERT(0); ASSERT(prev.br_state == XFS_EXT_NORM); ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == prev.br_startblock + prev.br_blockcount); unwrite_start = max3(start, del.br_startoff - mod, prev.br_startoff); mod = unwrite_start - prev.br_startoff; prev.br_startoff = unwrite_start; prev.br_startblock += mod; prev.br_blockcount -= mod; prev.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, whichfork, &icur, &cur, &prev, &logflags); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, whichfork, &icur, &cur, &del, &logflags); if (error) goto error0; goto nodelete; } } delete: if (wasdel) { xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); } else { error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, &del, &tmp_logflags, whichfork, flags); logflags |= tmp_logflags; if (error) goto error0; } end = del.br_startoff - 1; nodelete: /* * If not done go on to the next (previous) record. */ if (end != (xfs_fileoff_t)-1 && end >= start) { if (!xfs_iext_get_extent(ifp, &icur, &got) || (got.br_startoff > end && !xfs_iext_prev_extent(ifp, &icur, &got))) { done = true; break; } extno++; } } if (done || end == (xfs_fileoff_t)-1 || end < start) *rlen = 0; else *rlen = end - start + 1; /* * Convert to a btree if necessary. */ if (xfs_bmap_needs_btree(ip, whichfork)) { ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; } else { error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); } error0: /* * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. 
*/ if ((logflags & xfs_ilog_fext(whichfork)) && ifp->if_format != XFS_DINODE_FMT_EXTENTS) logflags &= ~xfs_ilog_fext(whichfork); else if ((logflags & xfs_ilog_fbroot(whichfork)) && ifp->if_format != XFS_DINODE_FMT_BTREE) logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log inode even in the error case, if the transaction * is dirty we'll need to shut down the filesystem. */ if (logflags) xfs_trans_log_inode(tp, ip, logflags); if (cur) { if (!error) cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; } /* Unmap a range of a file. */ int xfs_bunmapi( xfs_trans_t *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done) { int error; error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts); *done = (len == 0); return error; } /* * Determine whether an extent shift can be accomplished by a merge with the * extent that precedes the target hole of the shift. */ STATIC bool xfs_bmse_can_merge( struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_bmbt_irec *got, /* current extent to shift */ xfs_fileoff_t shift) /* shift fsb */ { xfs_fileoff_t startoff; startoff = got->br_startoff - shift; /* * The extent, once shifted, must be adjacent in-file and on-disk with * the preceding extent. */ if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN) || !xfs_bmap_same_rtgroup(ip, whichfork, left, got)) return false; return true; } /* * A bmap extent shift adjusts the file offset of an extent to fill a preceding * hole in the file. If an extent shift would result in the extent being fully * adjacent to the extent that currently precedes the hole, we can merge with * the preceding extent rather than do the shift. * * This function assumes the caller has verified a shift-by-merge is possible * with the provided extents via xfs_bmse_can_merge(). */ STATIC int xfs_bmse_merge( struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_fileoff_t shift, /* shift fsb */ struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, /* extent to shift */ struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_btree_cur *cur, int *logflags) /* output */ { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; struct xfs_mount *mp = ip->i_mount; blockcount = left->br_blockcount + got->br_blockcount; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); ASSERT(xfs_bmse_can_merge(ip, whichfork, left, got, shift)); new = *left; new.br_blockcount = blockcount; /* * Update the on-disk extent count, the btree if necessary and log the * inode. 
*/ ifp->if_nextents--; *logflags |= XFS_ILOG_CORE; if (!cur) { *logflags |= XFS_ILOG_DEXT; goto done; } /* lookup and remove the extent to merge */ error = xfs_bmbt_lookup_eq(cur, got, &i); if (error) return error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } error = xfs_btree_delete(cur, &i); if (error) return error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left, &i); if (error) return error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } error = xfs_bmbt_update(cur, &new); if (error) return error; /* change to extent format if required after extent removal */ error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork); if (error) return error; done: xfs_iext_remove(ip, icur, 0); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur, &new); /* update reverse mapping. rmap functions merge the rmaps for us */ xfs_rmap_unmap_extent(tp, ip, whichfork, got); memcpy(&new, got, sizeof(new)); new.br_startoff = left->br_startoff + left->br_blockcount; xfs_rmap_map_extent(tp, ip, whichfork, &new); return 0; } static int xfs_bmap_shift_update_extent( struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, struct xfs_btree_cur *cur, int *logflags, xfs_fileoff_t startoff) { struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec prev = *got; int error, i; *logflags |= XFS_ILOG_CORE; got->br_startoff = startoff; if (cur) { error = xfs_bmbt_lookup_eq(cur, &prev, &i); if (error) return error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } error = xfs_bmbt_update(cur, got); if (error) return error; } else { *logflags |= XFS_ILOG_DEXT; } xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur, got); /* update reverse mapping */ xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); xfs_rmap_map_extent(tp, ip, whichfork, got); return 0; } int xfs_bmap_collapse_extents( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done) { int whichfork = XFS_DATA_FORK; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got, prev; struct xfs_iext_cursor icur; xfs_fileoff_t new_startoff; int error = 0; int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { *done = true; goto del_cursor; } if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } new_startoff = got.br_startoff - offset_shift_fsb; if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) { if (new_startoff < prev.br_startoff + prev.br_blockcount) { error = -EINVAL; goto del_cursor; } if (xfs_bmse_can_merge(ip, whichfork, &prev, &got, offset_shift_fsb)) { error = xfs_bmse_merge(tp, ip, whichfork, offset_shift_fsb, &icur, &got, &prev, cur, 
&logflags); if (error) goto del_cursor; goto done; } } else { if (got.br_startoff < offset_shift_fsb) { error = -EINVAL; goto del_cursor; } } error = xfs_bmap_shift_update_extent(tp, ip, whichfork, &icur, &got, cur, &logflags, new_startoff); if (error) goto del_cursor; done: if (!xfs_iext_next_extent(ifp, &icur, &got)) { *done = true; goto del_cursor; } *next_fsb = got.br_startoff; del_cursor: if (cur) xfs_btree_del_cursor(cur, error); if (logflags) xfs_trans_log_inode(tp, ip, logflags); return error; } /* Make sure we won't be right-shifting an extent past the maximum bound. */ int xfs_bmap_can_insert_extents( struct xfs_inode *ip, xfs_fileoff_t off, xfs_fileoff_t shift) { struct xfs_bmbt_irec got; int is_empty; int error = 0; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); if (xfs_is_shutdown(ip->i_mount)) return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmap_last_extent(NULL, ip, XFS_DATA_FORK, &got, &is_empty); if (!error && !is_empty && got.br_startoff >= off && ((got.br_startoff + shift) & BMBT_STARTOFF_MASK) < got.br_startoff) error = -EINVAL; xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } int xfs_bmap_insert_extents( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb) { int whichfork = XFS_DATA_FORK; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got, next; struct xfs_iext_cursor icur; xfs_fileoff_t new_startoff; int error = 0; int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); if (*next_fsb == NULLFSBLOCK) { xfs_iext_last(ifp, &icur); if (!xfs_iext_get_extent(ifp, &icur, &got) || stop_fsb > got.br_startoff) { *done = true; goto del_cursor; } } else { if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { *done = true; goto del_cursor; } } if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) { xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } new_startoff = got.br_startoff + offset_shift_fsb; if (xfs_iext_peek_next_extent(ifp, &icur, &next)) { if (new_startoff + got.br_blockcount > next.br_startoff) { error = -EINVAL; goto del_cursor; } /* * Unlike a left shift (which involves a hole punch), a right * shift does not modify extent neighbors in any way. We should * never find mergeable extents in this scenario. Check anyways * and warn if we encounter two extents that could be one. 
*/ if (xfs_bmse_can_merge(ip, whichfork, &got, &next, offset_shift_fsb)) WARN_ON_ONCE(1); } error = xfs_bmap_shift_update_extent(tp, ip, whichfork, &icur, &got, cur, &logflags, new_startoff); if (error) goto del_cursor; if (!xfs_iext_prev_extent(ifp, &icur, &got) || stop_fsb >= got.br_startoff + got.br_blockcount) { *done = true; goto del_cursor; } *next_fsb = got.br_startoff; del_cursor: if (cur) xfs_btree_del_cursor(cur, error); if (logflags) xfs_trans_log_inode(tp, ip, logflags); return error; } /* * Split an extent into two extents at the block offset @split_fsb, so that * @split_fsb becomes the first block of the new extent. If @split_fsb lies in * a hole or at the first block of an existing extent, there is nothing to * split and we just return 0. */ int xfs_bmap_split_extent( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_fsb) { int whichfork = XFS_DATA_FORK; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ struct xfs_mount *mp = ip->i_mount; xfs_fsblock_t gotblkcnt; /* new block count for got */ struct xfs_iext_cursor icur; int error = 0; int logflags = 0; int i = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } if (xfs_is_shutdown(mp)) return -EIO; /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; /* * If there are no extents, or split_fsb lies in a hole, we are done. */ if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, &icur, &got) || got.br_startoff >= split_fsb) return 0; gotblkcnt = split_fsb - got.br_startoff; new.br_startoff = split_fsb; new.br_startblock = got.br_startblock + gotblkcnt; new.br_blockcount = got.br_blockcount - gotblkcnt; new.br_state = got.br_state; if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } } got.br_blockcount = gotblkcnt; xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), &icur, &got); logflags = XFS_ILOG_CORE; if (cur) { error = xfs_bmbt_update(cur, &got); if (error) goto del_cursor; } else logflags |= XFS_ILOG_DEXT; /* Add new extent */ xfs_iext_next(ifp, &icur); xfs_iext_insert(ip, &icur, &new, 0); ifp->if_nextents++; if (cur) { error = xfs_bmbt_lookup_eq(cur, &new, &i); if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 0)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } error = xfs_btree_insert(cur, &i); if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } } /* * Convert to a btree if necessary. */ if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; } del_cursor: if (cur) { cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } if (logflags) xfs_trans_log_inode(tp, ip, logflags); return error; } /* Record a bmap intent.
*/ static inline void __xfs_bmap_add( struct xfs_trans *tp, enum xfs_bmap_intent_type type, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *bmap) { struct xfs_bmap_intent *bi; if ((whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) || bmap->br_startblock == HOLESTARTBLOCK || bmap->br_startblock == DELAYSTARTBLOCK) return; bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; bi->bi_owner = ip; bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; xfs_bmap_defer_add(tp, bi); } /* Map an extent into a file. */ void xfs_bmap_map_extent( struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *PREV) { __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, whichfork, PREV); } /* Unmap an extent out of a file. */ void xfs_bmap_unmap_extent( struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *PREV) { __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, whichfork, PREV); } /* * Process one of the deferred bmap operations. We pass back the * btree cursor to maintain our lock on the bmapbt between calls. */ int xfs_bmap_finish_one( struct xfs_trans *tp, struct xfs_bmap_intent *bi) { struct xfs_bmbt_irec *bmap = &bi->bi_bmap; int error = 0; int flags = 0; if (bi->bi_whichfork == XFS_ATTR_FORK) flags |= XFS_BMAPI_ATTRFORK; ASSERT(tp->t_highest_agno == NULLAGNUMBER); trace_xfs_bmap_deferred(bi); if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (bi->bi_type) { case XFS_BMAP_MAP: if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN) flags |= XFS_BMAPI_PREALLOC; error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, bmap->br_blockcount, bmap->br_startblock, flags); bmap->br_blockcount = 0; break; case XFS_BMAP_UNMAP: error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, &bmap->br_blockcount, flags | XFS_BMAPI_REMAP, 1); break; default: ASSERT(0); xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork); error = -EFSCORRUPTED; } return error; } /* Check that an extent does not have invalid flags or bad ranges. */ xfs_failaddr_t xfs_bmap_validate_extent_raw( struct xfs_mount *mp, bool rtfile, int whichfork, struct xfs_bmbt_irec *irec) { if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) return __this_address; if (rtfile && whichfork == XFS_DATA_FORK) { if (!xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount)) return __this_address; } else { if (!xfs_verify_fsbext(mp, irec->br_startblock, irec->br_blockcount)) return __this_address; } if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK) return __this_address; return NULL; } int __init xfs_bmap_intent_init_cache(void) { xfs_bmap_intent_cache = kmem_cache_create("xfs_bmap_intent", sizeof(struct xfs_bmap_intent), 0, 0, NULL); return xfs_bmap_intent_cache != NULL ? 0 : -ENOMEM; } void xfs_bmap_intent_destroy_cache(void) { kmem_cache_destroy(xfs_bmap_intent_cache); xfs_bmap_intent_cache = NULL; } /* Check that an inode's extent does not have invalid flags or bad ranges. */ xfs_failaddr_t xfs_bmap_validate_extent( struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec) { return xfs_bmap_validate_extent_raw(ip->i_mount, XFS_IS_REALTIME_INODE(ip), whichfork, irec); } /* * Used in xfs_itruncate_extents(). This is the maximum number of extents * freed from a file in a single transaction. */ #define XFS_ITRUNC_MAX_EXTENTS 2 /* * Unmap every extent in part of an inode's fork. We don't do any higher level * invalidation work at all. 
*/ int xfs_bunmapi_range( struct xfs_trans **tpp, struct xfs_inode *ip, uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff) { xfs_filblks_t unmap_len = endoff - startoff + 1; int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); while (unmap_len > 0) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); error = __xfs_bunmapi(*tpp, ip, startoff, &unmap_len, flags, XFS_ITRUNC_MAX_EXTENTS); if (error) goto out; /* free the just unmapped extents */ error = xfs_defer_finish(tpp); if (error) goto out; cond_resched(); } out: return error; } struct xfs_bmap_query_range { xfs_bmap_query_range_fn fn; void *priv; }; /* Format btree record and pass to our callback. */ STATIC int xfs_bmap_query_range_helper( struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv) { struct xfs_bmap_query_range *query = priv; struct xfs_bmbt_irec irec; xfs_failaddr_t fa; xfs_bmbt_disk_get_all(&rec->bmbt, &irec); fa = xfs_bmap_validate_extent(cur->bc_ino.ip, cur->bc_ino.whichfork, &irec); if (fa) { xfs_btree_mark_sick(cur); return xfs_bmap_complain_bad_rec(cur->bc_ino.ip, cur->bc_ino.whichfork, fa, &irec); } return query->fn(cur, &irec, query->priv); } /* Find all bmaps. */ int xfs_bmap_query_all( struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn, void *priv) { struct xfs_bmap_query_range query = { .priv = priv, .fn = fn, }; return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query); } /* Helper function to extract extent size hint from inode */ xfs_extlen_t xfs_get_extsz_hint( struct xfs_inode *ip) { /* * No point in aligning allocations if we need to COW to actually * write to them. */ if (!xfs_is_always_cow_inode(ip) && (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) return ip->i_extsize; if (XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1) return ip->i_mount->m_sb.sb_rextsize; return 0; } /* * Helper function to extract CoW extent size hint from inode. * Between the extent size hint and the CoW extent size hint, we * return the greater of the two. If the value is zero (automatic), * use the default size. */ xfs_extlen_t xfs_get_cowextsz_hint( struct xfs_inode *ip) { xfs_extlen_t a, b; a = 0; if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) a = ip->i_cowextsize; if (XFS_IS_REALTIME_INODE(ip)) { b = 0; if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) b = ip->i_extsize; } else { b = xfs_get_extsz_hint(ip); } a = max(a, b); if (a == 0) return XFS_DEFAULT_COWEXTSZ_HINT; return a; }
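The following is an editor's illustration, not part of either kernel file: a minimal, self-contained user-space sketch of the hint selection performed by xfs_get_extsz_hint() and xfs_get_cowextsz_hint() above. All names here are hypothetical stand-ins; the on-disk flag tests are folded into fields that are zero when the corresponding flag is clear, and DEFAULT_COWEXTSZ_HINT is a placeholder value rather than the kernel's XFS_DEFAULT_COWEXTSZ_HINT.

#include <stdio.h>

#define DEFAULT_COWEXTSZ_HINT	32	/* placeholder, not the kernel constant */

/* Hypothetical flattened view of the inode fields the two helpers read. */
struct hint_inode {
	unsigned int	cowextsize;	/* i_cowextsize, 0 unless DIFLAG2_COWEXTSIZE is set */
	unsigned int	extsize;	/* i_extsize, 0 unless DIFLAG_EXTSIZE is set */
	unsigned int	rextsize;	/* sb_rextsize */
	int		realtime;	/* XFS_IS_REALTIME_INODE() */
	int		always_cow;	/* xfs_is_always_cow_inode() */
};

/* Mirrors xfs_get_extsz_hint(): no alignment hint if every write COWs anyway. */
static unsigned int extsz_hint(const struct hint_inode *ip)
{
	if (!ip->always_cow && ip->extsize)
		return ip->extsize;
	if (ip->realtime && ip->rextsize > 1)
		return ip->rextsize;
	return 0;
}

/* Mirrors xfs_get_cowextsz_hint(): the larger of the two hints, else the default. */
static unsigned int cowextsz_hint(const struct hint_inode *ip)
{
	unsigned int a = ip->cowextsize;
	unsigned int b = ip->realtime ? ip->extsize : extsz_hint(ip);
	unsigned int hint = a > b ? a : b;

	return hint ? hint : DEFAULT_COWEXTSZ_HINT;
}

int main(void)
{
	struct hint_inode ip = { .cowextsize = 64, .extsize = 128 };

	printf("cow hint = %u blocks\n", cowextsz_hint(&ip));	/* 128: extsize wins */
	ip.cowextsize = 0;
	ip.extsize = 0;
	printf("default = %u blocks\n", cowextsz_hint(&ip));	/* falls back to 32 */
	return 0;
}

Note how an always-CoW inode reports no regular extent size hint at all, so for such inodes only the CoW hint or the default can win.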
// SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/mount.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/slab.h> #include <uapi/linux/mount.h> #include "common.h" /* String table for special mount operations. */ static const char * const tomoyo_mounts[TOMOYO_MAX_SPECIAL_MOUNT] = { [TOMOYO_MOUNT_BIND] = "--bind", [TOMOYO_MOUNT_MOVE] = "--move", [TOMOYO_MOUNT_REMOUNT] = "--remount", [TOMOYO_MOUNT_MAKE_UNBINDABLE] = "--make-unbindable", [TOMOYO_MOUNT_MAKE_PRIVATE] = "--make-private", [TOMOYO_MOUNT_MAKE_SLAVE] = "--make-slave", [TOMOYO_MOUNT_MAKE_SHARED] = "--make-shared", }; /** * tomoyo_audit_mount_log - Audit mount log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mount_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file mount %s %s %s 0x%lX\n", r->param.mount.dev->name, r->param.mount.dir->name, r->param.mount.type->name, r->param.mount.flags); } /** * tomoyo_check_mount_acl - Check permission for path path path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mount_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mount_acl *acl = container_of(ptr, typeof(*acl), head); return tomoyo_compare_number_union(r->param.mount.flags, &acl->flags) && tomoyo_compare_name_union(r->param.mount.type, &acl->fs_type) && tomoyo_compare_name_union(r->param.mount.dir, &acl->dir_name) && (!r->param.mount.need_dev || tomoyo_compare_name_union(r->param.mount.dev, &acl->dev_name)); } /** * tomoyo_mount_acl - Check permission for mount() operation. * * @r: Pointer to "struct tomoyo_request_info". * @dev_name: Name of device file. Maybe NULL. * @dir: Pointer to "struct path". * @type: Name of filesystem type. * @flags: Mount options. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_mount_acl(struct tomoyo_request_info *r, const char *dev_name, const struct path *dir, const char *type, unsigned long flags) { struct tomoyo_obj_info obj = { }; struct path path; struct file_system_type *fstype = NULL; const char *requested_type = NULL; const char *requested_dir_name = NULL; const char *requested_dev_name = NULL; struct tomoyo_path_info rtype; struct tomoyo_path_info rdev; struct tomoyo_path_info rdir; int need_dev = 0; int error = -ENOMEM; r->obj = &obj; /* Get fstype.
*/ requested_type = tomoyo_encode(type); if (!requested_type) goto out; rtype.name = requested_type; tomoyo_fill_path_info(&rtype); /* Get mount point. */ obj.path2 = *dir; requested_dir_name = tomoyo_realpath_from_path(dir); if (!requested_dir_name) { error = -ENOMEM; goto out; } rdir.name = requested_dir_name; tomoyo_fill_path_info(&rdir); /* Compare fs name. */ if (type == tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_BIND] || type == tomoyo_mounts[TOMOYO_MOUNT_MOVE]) { need_dev = -1; /* dev_name is a directory */ } else { fstype = get_fs_type(type); if (!fstype) { error = -ENODEV; goto out; } if (fstype->fs_flags & FS_REQUIRES_DEV) /* dev_name is a block device file. */ need_dev = 1; } if (need_dev) { /* Get mount point or device file. */ if (!dev_name || kern_path(dev_name, LOOKUP_FOLLOW, &path)) { error = -ENOENT; goto out; } obj.path1 = path; requested_dev_name = tomoyo_realpath_from_path(&path); if (!requested_dev_name) { error = -ENOENT; goto out; } } else { /* Map dev_name to "<NULL>" if no dev_name given. */ if (!dev_name) dev_name = "<NULL>"; requested_dev_name = tomoyo_encode(dev_name); if (!requested_dev_name) { error = -ENOMEM; goto out; } } rdev.name = requested_dev_name; tomoyo_fill_path_info(&rdev); r->param_type = TOMOYO_TYPE_MOUNT_ACL; r->param.mount.need_dev = need_dev; r->param.mount.dev = &rdev; r->param.mount.dir = &rdir; r->param.mount.type = &rtype; r->param.mount.flags = flags; do { tomoyo_check_acl(r, tomoyo_check_mount_acl); error = tomoyo_audit_mount_log(r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(requested_dev_name); kfree(requested_dir_name); if (fstype) put_filesystem(fstype); kfree(requested_type); /* Drop refcount obtained by kern_path(). */ if (obj.path1.dentry) path_put(&obj.path1); return error; } /** * tomoyo_mount_permission - Check permission for mount() operation. * * @dev_name: Name of device file. Maybe NULL. * @path: Pointer to "struct path". * @type: Name of filesystem type. Maybe NULL. * @flags: Mount options. * @data_page: Optional data. Maybe NULL. * * Returns 0 on success, negative value otherwise. 
*/ int tomoyo_mount_permission(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data_page) { struct tomoyo_request_info r; int error; int idx; if (tomoyo_init_request_info(&r, NULL, TOMOYO_MAC_FILE_MOUNT) == TOMOYO_CONFIG_DISABLED) return 0; if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; if (flags & MS_REMOUNT) { type = tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]; flags &= ~MS_REMOUNT; } else if (flags & MS_BIND) { type = tomoyo_mounts[TOMOYO_MOUNT_BIND]; flags &= ~MS_BIND; } else if (flags & MS_SHARED) { if (flags & (MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]; flags &= ~MS_SHARED; } else if (flags & MS_PRIVATE) { if (flags & (MS_SHARED | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE]; flags &= ~MS_PRIVATE; } else if (flags & MS_SLAVE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE]; flags &= ~MS_SLAVE; } else if (flags & MS_UNBINDABLE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE]; flags &= ~MS_UNBINDABLE; } else if (flags & MS_MOVE) { type = tomoyo_mounts[TOMOYO_MOUNT_MOVE]; flags &= ~MS_MOVE; } if (!type) type = "<NULL>"; idx = tomoyo_read_lock(); error = tomoyo_mount_acl(&r, dev_name, path, type, flags); tomoyo_read_unlock(idx); return error; }
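As a companion to tomoyo_mount_permission() above, here is an editor's sketch, not part of the TOMOYO sources: a small user-space program showing how the MS_* flag bits are normalized, in the same precedence order, onto the special type strings kept in tomoyo_mounts[]. The helper name special_mount_type() is hypothetical, and the sketch deliberately omits the MS_MGC_VAL magic stripping and the -EINVAL rejection of conflicting propagation flags.

#include <stdio.h>
#include <sys/mount.h>	/* MS_REMOUNT, MS_BIND, MS_SHARED, ... */

/* Hypothetical helper: which special operation string would be audited? */
static const char *special_mount_type(unsigned long flags)
{
	if (flags & MS_REMOUNT)
		return "--remount";
	if (flags & MS_BIND)
		return "--bind";
	if (flags & MS_SHARED)
		return "--make-shared";
	if (flags & MS_PRIVATE)
		return "--make-private";
	if (flags & MS_SLAVE)
		return "--make-slave";
	if (flags & MS_UNBINDABLE)
		return "--make-unbindable";
	if (flags & MS_MOVE)
		return "--move";
	return NULL;	/* ordinary mount: the filesystem type itself is audited */
}

int main(void)
{
	printf("%s\n", special_mount_type(MS_BIND));		/* --bind */
	printf("%s\n", special_mount_type(MS_SHARED | MS_REC));	/* --make-shared */
	return 0;
}

An ordinary mount falls through to NULL, in which case the caller audits the real filesystem type string (or "<NULL>" when none was given).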
// SPDX-License-Identifier: GPL-2.0 /* * linux/drivers/char/misc.c * * Generic misc open routine by Johan Myreen * * Based on code from Linus * * Teemu Rantanen's Microsoft Busmouse support and Derrick Cole's * changes incorporated into 0.97pl4 * by Peter Cervasio (pete%q106fm.uucp@wupost.wustl.edu) (08SEP92) * See busmouse.c for particulars. * * Made things a lot more modular - easy to compile in just one or two * of the misc drivers, as they are now completely independent. Linus. * * Support for loadable modules. 8-Sep-95 Philip Blundell <pjb27@cam.ac.uk> * * Fixed a failing symbol register to free the device registration * Alan Cox <alan@lxorguk.ukuu.org.uk> 21-Jan-96 * * Dynamic minors and /proc/mice by Alessandro Rubini. 26-Mar-96 * * Renamed to misc and miscdevice to be more accurate. Alan Cox 26-Mar-96 * * Handling of mouse minor numbers for kerneld: * Idea by Jacques Gelinas <jack@solucorp.qc.ca>, * adapted by Bjorn Ekwall <bj0rn@blox.se> * corrected by Alan Cox <alan@lxorguk.ukuu.org.uk> * * Changes for kmod (from kerneld): * Cyrus Durgin <cider@speakeasy.org> * * Added devfs support.
Richard Gooch <rgooch@atnf.csiro.au> 10-Jan-1998 */ #include <linux/module.h> #include <linux/fs.h> #include <linux/errno.h> #include <linux/miscdevice.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/device.h> #include <linux/tty.h> #include <linux/kmod.h> #include <linux/gfp.h> /* * Head entry for the doubly linked miscdevice list */ static LIST_HEAD(misc_list); static DEFINE_MUTEX(misc_mtx); /* * Assigned numbers, used for dynamic minors */ #define DYNAMIC_MINORS 128 /* like dynamic majors */ static DEFINE_IDA(misc_minors_ida); static int misc_minor_alloc(int minor) { int ret = 0; if (minor == MISC_DYNAMIC_MINOR) { /* allocate free id */ ret = ida_alloc_max(&misc_minors_ida, DYNAMIC_MINORS - 1, GFP_KERNEL); if (ret >= 0) { ret = DYNAMIC_MINORS - ret - 1; } else { ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1, MINORMASK, GFP_KERNEL); } } else { /* specific minor, check if it is in dynamic or misc dynamic range */ if (minor < DYNAMIC_MINORS) { minor = DYNAMIC_MINORS - minor - 1; ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL); } else if (minor > MISC_DYNAMIC_MINOR) { ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL); } else { /* case of non-dynamic minors, no need to allocate id */ ret = 0; } } return ret; } static void misc_minor_free(int minor) { if (minor < DYNAMIC_MINORS) ida_free(&misc_minors_ida, DYNAMIC_MINORS - minor - 1); else if (minor > MISC_DYNAMIC_MINOR) ida_free(&misc_minors_ida, minor); } #ifdef CONFIG_PROC_FS static void *misc_seq_start(struct seq_file *seq, loff_t *pos) { mutex_lock(&misc_mtx); return seq_list_start(&misc_list, *pos); } static void *misc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &misc_list, pos); } static void misc_seq_stop(struct seq_file *seq, void *v) { mutex_unlock(&misc_mtx); } static int misc_seq_show(struct seq_file *seq, void *v) { const struct miscdevice *p = list_entry(v, struct miscdevice, list); seq_printf(seq, "%3i %s\n", p->minor, p->name ? 
p->name : ""); return 0; } static const struct seq_operations misc_seq_ops = { .start = misc_seq_start, .next = misc_seq_next, .stop = misc_seq_stop, .show = misc_seq_show, }; #endif static int misc_open(struct inode *inode, struct file *file) { int minor = iminor(inode); struct miscdevice *c = NULL, *iter; int err = -ENODEV; const struct file_operations *new_fops = NULL; mutex_lock(&misc_mtx); list_for_each_entry(iter, &misc_list, list) { if (iter->minor != minor) continue; c = iter; new_fops = fops_get(iter->fops); break; } if (!new_fops) { mutex_unlock(&misc_mtx); request_module("char-major-%d-%d", MISC_MAJOR, minor); mutex_lock(&misc_mtx); list_for_each_entry(iter, &misc_list, list) { if (iter->minor != minor) continue; c = iter; new_fops = fops_get(iter->fops); break; } if (!new_fops) goto fail; } /* * Place the miscdevice in the file's * private_data so it can be used by the * file operations, including f_op->open below */ file->private_data = c; err = 0; replace_fops(file, new_fops); if (file->f_op->open) err = file->f_op->open(inode, file); fail: mutex_unlock(&misc_mtx); return err; } static char *misc_devnode(const struct device *dev, umode_t *mode) { const struct miscdevice *c = dev_get_drvdata(dev); if (mode && c->mode) *mode = c->mode; if (c->nodename) return kstrdup(c->nodename, GFP_KERNEL); return NULL; } static const struct class misc_class = { .name = "misc", .devnode = misc_devnode, }; static const struct file_operations misc_fops = { .owner = THIS_MODULE, .open = misc_open, .llseek = noop_llseek, }; /** * misc_register - register a miscellaneous device * @misc: device structure * * Register a miscellaneous device with the kernel. If the minor * number is set to %MISC_DYNAMIC_MINOR a minor number is assigned * and placed in the minor field of the structure. For other cases * the minor number requested is used. * * The structure passed is linked into the kernel and may not be * destroyed until it has been unregistered. By default, an open() * syscall to the device sets file->private_data to point to the * structure. Drivers don't need open in fops for this. * * A zero is returned on success and a negative errno code for * failure. */ int misc_register(struct miscdevice *misc) { dev_t dev; int err = 0; bool is_dynamic = (misc->minor == MISC_DYNAMIC_MINOR); INIT_LIST_HEAD(&misc->list); mutex_lock(&misc_mtx); if (is_dynamic) { int i = misc_minor_alloc(misc->minor); if (i < 0) { err = -EBUSY; goto out; } misc->minor = i; } else { struct miscdevice *c; int i; list_for_each_entry(c, &misc_list, list) { if (c->minor == misc->minor) { err = -EBUSY; goto out; } } i = misc_minor_alloc(misc->minor); if (i < 0) { err = -EBUSY; goto out; } } dev = MKDEV(MISC_MAJOR, misc->minor); misc->this_device = device_create_with_groups(&misc_class, misc->parent, dev, misc, misc->groups, "%s", misc->name); if (IS_ERR(misc->this_device)) { if (is_dynamic) { misc_minor_free(misc->minor); misc->minor = MISC_DYNAMIC_MINOR; } err = PTR_ERR(misc->this_device); goto out; } /* * Add it to the front, so that later devices can "override" * earlier defaults */ list_add(&misc->list, &misc_list); out: mutex_unlock(&misc_mtx); return err; } EXPORT_SYMBOL(misc_register); /** * misc_deregister - unregister a miscellaneous device * @misc: device to unregister * * Unregister a miscellaneous device that was previously * successfully registered with misc_register(). 
*/ void misc_deregister(struct miscdevice *misc) { if (WARN_ON(list_empty(&misc->list))) return; mutex_lock(&misc_mtx); list_del(&misc->list); device_destroy(&misc_class, MKDEV(MISC_MAJOR, misc->minor)); misc_minor_free(misc->minor); mutex_unlock(&misc_mtx); } EXPORT_SYMBOL(misc_deregister); static int __init misc_init(void) { int err; struct proc_dir_entry *ret; ret = proc_create_seq("misc", 0, NULL, &misc_seq_ops); err = class_register(&misc_class); if (err) goto fail_remove; err = -EIO; if (register_chrdev(MISC_MAJOR, "misc", &misc_fops)) goto fail_printk; return 0; fail_printk: pr_err("unable to get major %d for misc devices\n", MISC_MAJOR); class_unregister(&misc_class); fail_remove: if (ret) remove_proc_entry("misc", NULL); return err; } subsys_initcall(misc_init);
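To make the registration API above concrete, here is a minimal sketch of a hypothetical out-of-tree driver that registers a dynamic-minor misc device; the "hello" name and the fops are assumptions for the example, not from this file.

/* Minimal sketch of a hypothetical client of misc_register(). */
#include <linux/module.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>

static int hello_open(struct inode *inode, struct file *file)
{
	/* misc_open() has already set file->private_data to the miscdevice */
	return 0;
}

static const struct file_operations hello_fops = {
	.owner = THIS_MODULE,
	.open = hello_open,
};

static struct miscdevice hello_misc = {
	.minor = MISC_DYNAMIC_MINOR,	/* let misc_register() pick a minor */
	.name = "hello",		/* appears as /dev/hello via the misc class */
	.fops = &hello_fops,
};

/* Registers on module load, deregisters on unload. */
module_misc_device(hello_misc);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("misc_register() usage sketch");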
// SPDX-License-Identifier: GPL-2.0 #include <linux/linkage.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/timex.h> #include <linux/random.h> #include <linux/init.h> #include <linux/kernel_stat.h> #include <linux/syscore_ops.h> #include <linux/bitops.h> #include <linux/acpi.h> #include <linux/io.h> #include <linux/delay.h> #include <linux/pgtable.h> #include <linux/atomic.h> #include <asm/timer.h> #include <asm/hw_irq.h> #include <asm/desc.h> #include <asm/apic.h> #include <asm/i8259.h> /* * This is the 'legacy' 8259A Programmable Interrupt Controller, * present in the majority of PC/AT boxes, * plus some generic x86-specific things, if that makes * any sense at all. */ static void init_8259A(int auto_eoi); static bool pcat_compat __ro_after_init; static int i8259A_auto_eoi; DEFINE_RAW_SPINLOCK(i8259A_lock); /* * 8259A PIC functions to handle ISA devices: */ /* * This contains the irq mask for both 8259A irq controllers. */ unsigned int cached_irq_mask = 0xffff; /* * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) * boards the timer interrupt is not really connected to any IO-APIC pin, * it's fed to the master 8259A's IR0 line only. * * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. * This 'mixed mode' IRQ handling costs nothing because it's only used * at IRQ setup time.
*/ unsigned long io_apic_irqs; static void mask_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); raw_spin_unlock_irqrestore(&i8259A_lock, flags); } static void disable_8259A_irq(struct irq_data *data) { mask_8259A_irq(data->irq); } static void unmask_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); raw_spin_unlock_irqrestore(&i8259A_lock, flags); } static void enable_8259A_irq(struct irq_data *data) { unmask_8259A_irq(data->irq); } static int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<<irq; unsigned long flags; int ret; raw_spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) ret = inb(PIC_MASTER_CMD) & mask; else ret = inb(PIC_SLAVE_CMD) & (mask >> 8); raw_spin_unlock_irqrestore(&i8259A_lock, flags); return ret; } static void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq); irq_set_status_flags(irq, IRQ_LEVEL); enable_irq(irq); lapic_assign_legacy_vector(irq, true); } /* * This function is expected to be called rarely. Switching between * 8259A registers is slow. * This has to be protected by the irq controller spinlock * before being called. */ static inline int i8259A_irq_real(unsigned int irq) { int value; int irqmask = 1<<irq; if (irq < 8) { outb(0x0B, PIC_MASTER_CMD); /* ISR register */ value = inb(PIC_MASTER_CMD) & irqmask; outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */ return value; } outb(0x0B, PIC_SLAVE_CMD); /* ISR register */ value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */ return value; } /* * Careful! The 8259A is a fragile beast, it pretty * much _has_ to be done exactly like this (mask it * first, _then_ send the EOI, and the order of EOI * to the two 8259s is important!) */ static void mask_and_ack_8259A(struct irq_data *data) { unsigned int irq = data->irq; unsigned int irqmask = 1 << irq; unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); /* * Lightweight spurious IRQ detection. We do not want * to overdo spurious IRQ handling - it's usually a sign * of hardware problems, so we only do the checks we can * do without slowing down good hardware unnecessarily. * * Note that IRQ7 and IRQ15 (the two spurious IRQs * usually resulting from the 8259A-1|2 PICs) occur * even if the IRQ is masked in the 8259A. Thus we * can check spurious 8259A IRQs without doing the * quite slow i8259A_irq_real() call for every IRQ. * This does not cover 100% of spurious interrupts, * but should be enough to warn the user that there * is something bad going on ... */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; cached_irq_mask |= irqmask; handle_real_irq: if (irq & 8) { inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* 'Specific EOI' to slave */ outb(0x60+(irq&7), PIC_SLAVE_CMD); /* 'Specific EOI' to master-IRQ2 */ outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD); } else { inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?)
*/ outb(cached_master_mask, PIC_MASTER_IMR); outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI' to master */ } raw_spin_unlock_irqrestore(&i8259A_lock, flags); return; spurious_8259A_irq: /* * This is the slow path - should happen rarely. */ if (i8259A_irq_real(irq)) /* * oops, the IRQ _is_ in service according to the * 8259A - not spurious, go handle it. */ goto handle_real_irq; { static int spurious_irq_mask; /* * At this point we can be sure the IRQ is spurious, * let's ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { printk_deferred(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } atomic_inc(&irq_err_count); /* * Theoretically we do not have to handle this IRQ, * but in Linux this does not cause problems and is * simpler for us. */ goto handle_real_irq; } } struct irq_chip i8259A_chip = { .name = "XT-PIC", .irq_mask = disable_8259A_irq, .irq_disable = disable_8259A_irq, .irq_unmask = enable_8259A_irq, .irq_mask_ack = mask_and_ack_8259A, }; static char irq_trigger[2]; /* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */ static void restore_ELCR(char *trigger) { outb(trigger[0], PIC_ELCR1); outb(trigger[1], PIC_ELCR2); } static void save_ELCR(char *trigger) { /* IRQ 0,1,2,8,13 are marked as reserved */ trigger[0] = inb(PIC_ELCR1) & 0xF8; trigger[1] = inb(PIC_ELCR2) & 0xDE; } static void i8259A_resume(void) { init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); } static int i8259A_suspend(void) { save_ELCR(irq_trigger); return 0; } static void i8259A_shutdown(void) { /* Put the i8259A into a quiescent state that * the kernel initialization code can get it * out of. */ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ } static struct syscore_ops i8259_syscore_ops = { .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; static void mask_8259A(void) { unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ raw_spin_unlock_irqrestore(&i8259A_lock, flags); } static void unmask_8259A(void) { unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ raw_spin_unlock_irqrestore(&i8259A_lock, flags); } static int probe_8259A(void) { unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR); unsigned long flags; /* * If MADT has the PCAT_COMPAT flag set, then do not bother probing * for the PIC. Some BIOSes leave the PIC uninitialized and probing * fails. * * Right now this causes problems as quite a bit of code depends on * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly * when the system has an IO/APIC because then the PIC is not required * at all, except for really old machines where the timer interrupt * must be routed through the PIC. So just pretend that the PIC is * there and let legacy_pic->init() initialize it for nothing. * * Alternatively this could just try to initialize the PIC and * repeat the probe, but for cases where there is no PIC that's * just pointless. */ if (pcat_compat) return nr_legacy_irqs(); /* * Check to see if we have a PIC. Mask all except the cascade and * read back the value we just wrote. If we don't have a PIC, we * will read 0xff as opposed to the value we wrote.
*/ raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ outb(probe_val, PIC_MASTER_IMR); new_val = inb(PIC_MASTER_IMR); if (new_val != probe_val) { printk(KERN_INFO "Using NULL legacy PIC\n"); legacy_pic = &null_legacy_pic; } raw_spin_unlock_irqrestore(&i8259A_lock, flags); return nr_legacy_irqs(); } static void init_8259A(int auto_eoi) { unsigned long flags; i8259A_auto_eoi = auto_eoi; raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ /* * outb_pic - this has to work on a wide range of PC hardware. */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */ outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); if (auto_eoi) /* master does Auto EOI */ outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); else /* master expects normal EOI */ outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */ outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); if (auto_eoi) /* * In AEOI mode we just have to mask the interrupt * when acking. */ i8259A_chip.irq_mask_ack = disable_8259A_irq; else i8259A_chip.irq_mask_ack = mask_and_ack_8259A; udelay(100); /* wait for 8259A to initialize */ outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ raw_spin_unlock_irqrestore(&i8259A_lock, flags); } /* * make i8259 a driver so that we can select pic functions at run time. the goal * is to make x86 binary compatible among pc compatible and non-pc compatible * platforms, such as x86 MID. */ static void legacy_pic_noop(void) { }; static void legacy_pic_uint_noop(unsigned int unused) { }; static void legacy_pic_int_noop(int unused) { }; static int legacy_pic_irq_pending_noop(unsigned int irq) { return 0; } static int legacy_pic_probe(void) { return 0; } struct legacy_pic null_legacy_pic = { .nr_legacy_irqs = 0, .chip = &dummy_irq_chip, .mask = legacy_pic_uint_noop, .unmask = legacy_pic_uint_noop, .mask_all = legacy_pic_noop, .restore_mask = legacy_pic_noop, .init = legacy_pic_int_noop, .probe = legacy_pic_probe, .irq_pending = legacy_pic_irq_pending_noop, .make_irq = legacy_pic_uint_noop, }; static struct legacy_pic default_legacy_pic = { .nr_legacy_irqs = NR_IRQS_LEGACY, .chip = &i8259A_chip, .mask = mask_8259A_irq, .unmask = unmask_8259A_irq, .mask_all = mask_8259A, .restore_mask = unmask_8259A, .init = init_8259A, .probe = probe_8259A, .irq_pending = i8259A_irq_pending, .make_irq = make_8259A_irq, }; struct legacy_pic *legacy_pic = &default_legacy_pic; EXPORT_SYMBOL(legacy_pic); static int __init i8259A_init_ops(void) { if (legacy_pic == &default_legacy_pic) register_syscore_ops(&i8259_syscore_ops); return 0; } device_initcall(i8259A_init_ops); void __init legacy_pic_pcat_compat(void) { pcat_compat = true; }
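The master/slave split that mask_8259A_irq() and unmask_8259A_irq() implement above can be demonstrated outside the kernel. A small user-space sketch, assumed for illustration (in the real code, cached_master_mask and cached_slave_mask are byte views of cached_irq_mask):

/* Illustrative sketch only: how a 16-bit cached IRQ mask splits across
 * the two 8259A Interrupt Mask Registers. */
#include <stdio.h>

int main(void)
{
	unsigned int cached_irq_mask = 0xffff;	/* power-on state: all masked */
	unsigned int irq = 10;			/* a slave PIC line (IRQs 8-15) */

	cached_irq_mask &= ~(1U << irq);	/* unmask IRQ 10 */

	unsigned char master_imr = cached_irq_mask & 0xff;	  /* IRQs 0-7 */
	unsigned char slave_imr = (cached_irq_mask >> 8) & 0xff; /* IRQs 8-15 */

	/* irq & 8 selects which IMR the kernel writes, as in the code above */
	printf("IRQ %u -> %s IMR (master=0x%02x, slave=0x%02x)\n",
	       irq, (irq & 8) ? "slave" : "master", master_imr, slave_imr);
	return 0;
}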
// SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/revoke.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 * * Copyright 2000 Red Hat corp --- All Rights Reserved * * Journal revoke routines for the generic filesystem journaling code; * part of the ext2fs journaling system. * * Revoke is the mechanism used to prevent old log records for deleted * metadata from being replayed on top of newer data using the same * blocks.
The revoke mechanism is used in two separate places: * * + Commit: during commit we write the entire list of the current * transaction's revoked blocks to the journal * * + Recovery: during recovery we record the transaction ID of all * revoked blocks. If there are multiple revoke records in the log * for a single block, only the last one counts, and if there is a log * entry for a block beyond the last revoke, then that log entry still * gets replayed. * * We can get interactions between revokes and new log data within a * single transaction: * * Block is revoked and then journaled: * The desired end result is the journaling of the new block, so we * cancel the revoke before the transaction commits. * * Block is journaled and then revoked: * The revoke must take precedence over the write of the block, so we * need either to cancel the journal entry or to write the revoke * later in the log than the log block. In this case, we choose the * latter: journaling a block cancels any revoke record for that block * in the current transaction, so any revoke for that block in the * transaction must have happened after the block was journaled and so * the revoke must take precedence. * * Block is revoked and then written as data: * The data write is allowed to succeed, but the revoke is _not_ * cancelled. We still need to prevent old log records from * overwriting the new data. We don't even need to clear the revoke * bit here. * * We cache the revoke status of a buffer in the current transaction in b_states * bits. As the name says, the revokevalid flag indicates that the cached revoke * status of a buffer is valid and we can rely on the cached status. * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up * RevokeValid set, Revoked clear: * buffer has not been revoked, and cancel_revoke * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. * * Locking rules: * We keep two hash tables of revoke records. One hashtable belongs to the * running transaction (is pointed to by journal->j_revoke), the other one * belongs to the committing transaction. Accesses to the second hash table * happen only from kjournald and no other thread touches this table. Also, * journal_switch_revoke_table(), which switches which hashtable belongs to the * running and which to the committing transaction, is called only from * kjournald. Therefore we need no locks when accessing the hashtable belonging * to the committing transaction. * * All users operating on the hash table belonging to the running transaction * have a handle to the transaction. Therefore they are safe from kjournald * switching hash tables under them. For operations on the lists of entries in * the hash table j_revoke_lock is used. * * Finally, the replay code also uses the hash tables, but at that moment no one * else can touch them (the filesystem isn't mounted yet) and hence no locking is * needed. */ #ifndef __KERNEL__ #include "jfs_user.h" #else #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/log2.h> #include <linux/hash.h> #endif static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; /* Each revoke record represents one single revoked block. During journal replay, this involves recording the transaction ID of the last transaction to revoke this block.
*/ struct jbd2_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ unsigned long long blocknr; }; /* The revoke table is just a simple hash table of revoke records. */ struct jbd2_revoke_table_s { /* It is conceivable that we might want a larger hash table * for recovery. Must be a power of two. */ int hash_size; int hash_shift; struct list_head *hash_table; }; #ifdef __KERNEL__ static void write_one_revoke_record(transaction_t *, struct list_head *, struct buffer_head **, int *, struct jbd2_revoke_record_s *); static void flush_descriptor(journal_t *, struct buffer_head *, int); #endif /* Utility functions to maintain the revoke table */ static inline int hash(journal_t *journal, unsigned long long block) { return hash_64(block, journal->j_revoke->hash_shift); } static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, tid_t seq) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; gfp_t gfp_mask = GFP_NOFS; if (journal_oom_retry) gfp_mask |= __GFP_NOFAIL; record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; } /* Find a revoke record in the journal's hash table. */ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, unsigned long long blocknr) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); record = (struct jbd2_revoke_record_s *) hash_list->next; while (&(record->hash) != hash_list) { if (record->blocknr == blocknr) { spin_unlock(&journal->j_revoke_lock); return record; } record = (struct jbd2_revoke_record_s *) record->hash.next; } spin_unlock(&journal->j_revoke_lock); return NULL; } void jbd2_journal_destroy_revoke_record_cache(void) { kmem_cache_destroy(jbd2_revoke_record_cache); jbd2_revoke_record_cache = NULL; } void jbd2_journal_destroy_revoke_table_cache(void) { kmem_cache_destroy(jbd2_revoke_table_cache); jbd2_revoke_table_cache = NULL; } int __init jbd2_journal_init_revoke_record_cache(void) { J_ASSERT(!jbd2_revoke_record_cache); jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); if (!jbd2_revoke_record_cache) { pr_emerg("JBD2: failed to create revoke_record cache\n"); return -ENOMEM; } return 0; } int __init jbd2_journal_init_revoke_table_cache(void) { J_ASSERT(!jbd2_revoke_table_cache); jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, SLAB_TEMPORARY); if (!jbd2_revoke_table_cache) { pr_emerg("JBD2: failed to create revoke_table cache\n"); return -ENOMEM; } return 0; } static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; struct jbd2_revoke_table_s *table; table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); if (!table) goto out; while((tmp >>= 1UL) != 0UL) shift++; table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; goto out; } for (tmp = 0; tmp < hash_size; tmp++) INIT_LIST_HEAD(&table->hash_table[tmp]); out: return table; } static void jbd2_journal_destroy_revoke_table(struct 
jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; for (i = 0; i < table->hash_size; i++) { hash_list = &table->hash_table[i]; J_ASSERT(list_empty(hash_list)); } kfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } /* Initialise the revoke table for a given journal to a given size. */ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) { J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[0]) goto fail0; journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[1]) goto fail1; journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; fail1: jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); journal->j_revoke_table[0] = NULL; fail0: return -ENOMEM; } /* Destroy a journal's revoke table. The table must already be empty! */ void jbd2_journal_destroy_revoke(journal_t *journal) { journal->j_revoke = NULL; if (journal->j_revoke_table[0]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); if (journal->j_revoke_table[1]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); } #ifdef __KERNEL__ /* * jbd2_journal_revoke: revoke a given buffer_head from the journal. This * prevents the block from being replayed during recovery if we take a * crash after this current transaction commits. Any subsequent * metadata writes of the buffer in this transaction cancel the * revoke. * * Note that this call may block --- it is up to the caller to make * sure that there are no further calls to journal_write_metadata * before the revoke is complete. In ext3, this implies calling the * revoke before clearing the block bitmap when we are deleting * metadata. * * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a * parameter, but does _not_ forget the buffer_head if the bh was only * found implicitly. * * bh_in may not be a journalled buffer - it may have come off * the hash tables without an attached journal_head. * * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count * by one. */ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; journal_t *journal; struct block_device *bdev; int err; might_sleep(); if (bh_in) BUFFER_TRACE(bh_in, "enter"); journal = handle->h_transaction->t_journal; if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){ J_ASSERT (!"Cannot set revoke feature!"); return -EINVAL; } bdev = journal->j_fs_dev; bh = bh_in; if (!bh) { bh = __find_get_block(bdev, blocknr, journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } #ifdef JBD2_EXPENSIVE_CHECKING else { struct buffer_head *bh2; /* If there is a different buffer_head lying around in * memory anywhere... */ bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if (bh2 != bh && buffer_revokevalid(bh2)) /* ...then it better be revoked too, * since it's illegal to create a revoke * record against a buffer_head which is * not marked revoked --- that would * risk missing a subsequent revoke * cancel. 
*/ J_ASSERT_BH(bh2, buffer_revoked(bh2)); put_bh(bh2); } } #endif if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) { if (!bh_in) brelse(bh); return -EIO; } /* We really ought not ever to revoke twice in a row without first having the revoke cancelled: it's illegal to free a block twice without allocating it in between! */ if (bh) { if (!J_EXPECT_BH(bh, !buffer_revoked(bh), "inconsistent data on disk")) { if (!bh_in) brelse(bh); return -EIO; } set_buffer_revoked(bh); set_buffer_revokevalid(bh); if (bh_in) { BUFFER_TRACE(bh_in, "call jbd2_journal_forget"); jbd2_journal_forget(handle, bh_in); } else { BUFFER_TRACE(bh, "call brelse"); __brelse(bh); } } handle->h_revoke_credits--; jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n", blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); return err; } /* * Cancel an outstanding revoke. For use only internally by the * journaling code (called from jbd2_journal_get_write_access). * * We trust buffer_revoked() on the buffer if the buffer is already * being journaled: if there is no revoke pending on the buffer, then we * don't do anything here. * * This would break if it were possible for a buffer to be revoked and * discarded, and then reallocated within the same transaction. In such * a case we would have lost the revoked bit, but when we arrived here * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. */ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { struct jbd2_revoke_record_s *record; journal_t *journal = handle->h_transaction->t_journal; int need_cancel; int did_revoke = 0; /* akpm: debug */ struct buffer_head *bh = jh2bh(jh); jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); /* Is the existing Revoke bit valid? If so, we trust it, and * only perform the full cancel if the revoke bit is set. If * not, we can't trust the revoke bit, and we need to do the * full search for a revoke record. */ if (test_set_buffer_revokevalid(bh)) { need_cancel = test_clear_buffer_revoked(bh); } else { need_cancel = 1; clear_buffer_revoked(bh); } if (need_cancel) { record = find_revoke_record(journal, bh->b_blocknr); if (record) { jbd2_debug(4, "cancelled existing revoke on " "blocknr %llu\n", (unsigned long long)bh->b_blocknr); spin_lock(&journal->j_revoke_lock); list_del(&record->hash); spin_unlock(&journal->j_revoke_lock); kmem_cache_free(jbd2_revoke_record_cache, record); did_revoke = 1; } } #ifdef JBD2_EXPENSIVE_CHECKING /* There better not be one left behind by now! */ record = find_revoke_record(journal, bh->b_blocknr); J_ASSERT_JH(jh, record == NULL); #endif /* Finally, have we just cleared revoke on an unhashed * buffer_head? If so, we'd better make sure we clear the * revoked status on any hashed alias too, otherwise the revoke * state machine will get very upset later on. */ if (need_cancel) { struct buffer_head *bh2; bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); if (bh2) { if (bh2 != bh) clear_buffer_revoked(bh2); __brelse(bh2); } } return did_revoke; } /* * journal_clear_revoked_flag clears the revoked flag of buffers in the * revoke table to reflect that there are no revoked buffers in the next * transaction which is going to be started.
*/ void jbd2_clear_buffer_revoked_flags(journal_t *journal) { struct jbd2_revoke_table_s *revoke = journal->j_revoke; int i = 0; for (i = 0; i < revoke->hash_size; i++) { struct list_head *hash_list; struct list_head *list_entry; hash_list = &revoke->hash_table[i]; list_for_each(list_entry, hash_list) { struct jbd2_revoke_record_s *record; struct buffer_head *bh; record = (struct jbd2_revoke_record_s *)list_entry; bh = __find_get_block(journal->j_fs_dev, record->blocknr, journal->j_blocksize); if (bh) { clear_buffer_revoked(bh); __brelse(bh); } } } } /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz */ void jbd2_journal_switch_revoke_table(journal_t *journal) { int i; if (journal->j_revoke == journal->j_revoke_table[0]) journal->j_revoke = journal->j_revoke_table[1]; else journal->j_revoke = journal->j_revoke_table[0]; for (i = 0; i < journal->j_revoke->hash_size; i++) INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); } /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. */ void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs) { journal_t *journal = transaction->t_journal; struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; struct list_head *hash_list; int i, offset, count; descriptor = NULL; offset = 0; count = 0; /* select revoke table for committing transaction */ revoke = journal->j_revoke == journal->j_revoke_table[0] ? journal->j_revoke_table[1] : journal->j_revoke_table[0]; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; write_one_revoke_record(transaction, log_bufs, &descriptor, &offset, record); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) flush_descriptor(journal, descriptor, offset); jbd2_debug(1, "Wrote %d revoke records\n", count); } /* * Write out one revoke record. We need to create a new descriptor * block if the old one is full or if we have not already created one. */ static void write_one_revoke_record(transaction_t *transaction, struct list_head *log_bufs, struct buffer_head **descriptorp, int *offsetp, struct jbd2_revoke_record_s *record) { journal_t *journal = transaction->t_journal; int csum_size = 0; struct buffer_head *descriptor; int sz, offset; /* If we are already aborting, this all becomes a noop. We still need to go round the loop in jbd2_journal_write_revoke_records in order to free all of the revoke records: only the IO to the journal is omitted. */ if (is_journal_aborted(journal)) return; descriptor = *descriptorp; offset = *offsetp; /* Do we need to leave space at the end for a checksum? 
*/ if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_block_tail); if (jbd2_has_feature_64bit(journal)) sz = 8; else sz = 4; /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset + sz > journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset); descriptor = NULL; } } if (!descriptor) { descriptor = jbd2_journal_get_descriptor_buffer(transaction, JBD2_REVOKE_BLOCK); if (!descriptor) return; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(descriptor, "file in log_bufs"); jbd2_file_log_bh(log_bufs, descriptor); offset = sizeof(jbd2_journal_revoke_header_t); *descriptorp = descriptor; } if (jbd2_has_feature_64bit(journal)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); else * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); offset += sz; *offsetp = offset; } /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to * be waited for during commit, so it has to go onto the appropriate * journal buffer list. */ static void flush_descriptor(journal_t *journal, struct buffer_head *descriptor, int offset) { jbd2_journal_revoke_header_t *header; if (is_journal_aborted(journal)) return; header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); jbd2_descriptor_block_csum_set(journal, descriptor); set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); } #endif /* * Revoke support for recovery. * * Recovery needs to be able to: * * record all revoke records, including the tid of the latest instance * of each revoke in the journal * * check whether a given block in a given transaction should be replayed * (i.e. has not been revoked by a revoke record in that or a subsequent * transaction) * * empty the revoke table after recovery. */ /* * First, setting revoke records. We create a new revoke record for * every block ever revoked in the log as we scan it for recovery, and * we update the existing records if we find multiple revokes for a * single block. */ int jbd2_journal_set_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (record) { /* If we have multiple occurrences, only record the * latest sequence number in the hashed record */ if (tid_gt(sequence, record->sequence)) record->sequence = sequence; return 0; } return insert_revoke_hash(journal, blocknr, sequence); } /* * Test revoke records. For a given block referenced in the log, has * that block been revoked? A revoke record with a given transaction * sequence number revokes all blocks in that transaction and earlier * ones, but later transactions still need to be replayed. */ int jbd2_journal_test_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (!record) return 0; if (tid_gt(sequence, record->sequence)) return 0; return 1; } /* * Finally, once recovery is over, we need to clear the revoke table so * that it can be reused by the running filesystem.
*/ void jbd2_journal_clear_revoke(journal_t *journal) { int i; struct list_head *hash_list; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; revoke = journal->j_revoke; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s*) hash_list->next; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } }
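The recovery-time semantics of jbd2_journal_set_revoke() and jbd2_journal_test_revoke() above boil down to "latest revoking TID wins". Here is a self-contained toy model of just that rule (hypothetical names, a fixed-size table instead of the real hash lists, and plain integer comparison instead of the wraparound-safe tid_gt()):

/* Toy sketch only: latest-revoke-wins semantics, not the kernel code. */
#include <stdio.h>
#include <stdbool.h>

struct revoke { unsigned long long blocknr; unsigned int tid; bool used; };

static struct revoke table[16];	/* stand-in for the per-journal hash table */

static void set_revoke(unsigned long long blk, unsigned int tid)
{
	struct revoke *r = &table[blk % 16];
	/* toy model: assumes no bucket collisions between different blocks */
	if (!r->used || r->blocknr != blk || tid > r->tid) {
		r->blocknr = blk;
		r->tid = tid;	/* keep only the latest revoking TID */
		r->used = true;
	}
}

static bool test_revoke(unsigned long long blk, unsigned int tid)
{
	struct revoke *r = &table[blk % 16];
	/* revoked iff a record exists and the log entry's TID is not newer */
	return r->used && r->blocknr == blk && tid <= r->tid;
}

int main(void)
{
	set_revoke(42, 7);
	set_revoke(42, 9);	/* a later revoke overrides the earlier one */
	/* entry at TID 8 is suppressed (1); entry at TID 10 is replayed (0) */
	printf("%d %d\n", test_revoke(42, 8), test_revoke(42, 10));
	return 0;
}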
// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_update.h" #include "errcode.h" #include "error.h" #include "inode.h" #include "quota.h" #include "snapshot.h" #include "super-io.h" static const char * const bch2_quota_types[] = { "user", "group", "project", }; static const char * const bch2_quota_counters[] = { "space", "inodes", }; static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_quota *q = field_to_type(f, quota); if (vstruct_bytes(&q->field) < sizeof(*q)) { prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&q->field), sizeof(*q)); return -BCH_ERR_invalid_sb_quota; } return 0; } static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_quota *q = field_to_type(f, quota); unsigned qtyp, counter; for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { prt_printf(out, "%s: flags %llx", bch2_quota_types[qtyp], le64_to_cpu(q->q[qtyp].flags)); for (counter = 0; counter < Q_COUNTERS; counter++) prt_printf(out, " %s timelimit %u warnlimit %u", bch2_quota_counters[counter], le32_to_cpu(q->q[qtyp].c[counter].timelimit), le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); prt_newline(out); } } const struct bch_sb_field_ops bch_sb_field_ops_quota = { .validate = bch2_sb_quota_validate, .to_text = bch2_sb_quota_to_text, }; int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { int ret = 0; bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, quota_type_invalid, "invalid quota type (%llu >= %u)", k.k->p.inode, QTYP_NR); fsck_err: return ret; } void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); unsigned i; for (i = 0; i < Q_COUNTERS; i++) prt_printf(out, "%s hardlimit %llu softlimit %llu", bch2_quota_counters[i], le64_to_cpu(dq.v->c[i].hardlimit), le64_to_cpu(dq.v->c[i].softlimit)); } #ifdef CONFIG_BCACHEFS_QUOTA #include <linux/cred.h> #include <linux/fs.h> #include <linux/quota.h> static void qc_info_to_text(struct printbuf *out, struct qc_info *i) { printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 20); prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask); prt_printf(out, "i_flags\t%u\n", i->i_flags); prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit); prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit); prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit); prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit); prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit); prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit); } static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) { printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 20); prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask); prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit); prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit); prt_printf(out, "d_ino_hardlimit\t%llu\n", q->d_ino_hardlimit); prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit); prt_printf(out, "d_space\t%llu\n", q->d_space); prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count); prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer); prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer); prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns); prt_printf(out, "d_spc_warns\t%i\n",
		       q->d_spc_warns);
}

static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
{
	qtypes >>= i;
	return qtypes ? i + __ffs(qtypes) : QTYP_NR;
}

#define for_each_set_qtype(_c, _i, _q, _qtypes)				\
	for (_i = 0;							\
	     (_i = __next_qtype(_i, _qtypes),				\
	      _q = &(_c)->quotas[_i],					\
	      _i < QTYP_NR);						\
	     _i++)

static bool ignore_hardlimit(struct bch_memquota_type *q)
{
	if (capable(CAP_SYS_RESOURCE))
		return true;
#if 0
	struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];

	return capable(CAP_SYS_RESOURCE) &&
	       (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
		!(info->dqi_flags & DQF_ROOT_SQUASH));
#endif
	return false;
}

enum quota_msg {
	SOFTWARN,	/* Softlimit reached */
	SOFTLONGWARN,	/* Grace time expired */
	HARDWARN,	/* Hardlimit reached */

	HARDBELOW,	/* Usage got below inode hardlimit */
	SOFTBELOW,	/* Usage got below inode softlimit */
};

static int quota_nl[][Q_COUNTERS] = {
	[HARDWARN][Q_SPC]	= QUOTA_NL_BHARDWARN,
	[SOFTLONGWARN][Q_SPC]	= QUOTA_NL_BSOFTLONGWARN,
	[SOFTWARN][Q_SPC]	= QUOTA_NL_BSOFTWARN,
	[HARDBELOW][Q_SPC]	= QUOTA_NL_BHARDBELOW,
	[SOFTBELOW][Q_SPC]	= QUOTA_NL_BSOFTBELOW,

	[HARDWARN][Q_INO]	= QUOTA_NL_IHARDWARN,
	[SOFTLONGWARN][Q_INO]	= QUOTA_NL_ISOFTLONGWARN,
	[SOFTWARN][Q_INO]	= QUOTA_NL_ISOFTWARN,
	[HARDBELOW][Q_INO]	= QUOTA_NL_IHARDBELOW,
	[SOFTBELOW][Q_INO]	= QUOTA_NL_ISOFTBELOW,
};

struct quota_msgs {
	u8		nr;
	struct {
		u8	qtype;
		u8	msg;
	}		m[QTYP_NR * Q_COUNTERS];
};

static void prepare_msg(unsigned qtype,
			enum quota_counters counter,
			struct quota_msgs *msgs,
			enum quota_msg msg_type)
{
	BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));

	msgs->m[msgs->nr].qtype	= qtype;
	msgs->m[msgs->nr].msg	= quota_nl[msg_type][counter];
	msgs->nr++;
}

static void prepare_warning(struct memquota_counter *qc,
			    unsigned qtype,
			    enum quota_counters counter,
			    struct quota_msgs *msgs,
			    enum quota_msg msg_type)
{
	if (qc->warning_issued & (1 << msg_type))
		return;

	prepare_msg(qtype, counter, msgs, msg_type);
}

static void flush_warnings(struct bch_qid qid,
			   struct super_block *sb,
			   struct quota_msgs *msgs)
{
	unsigned i;

	for (i = 0; i < msgs->nr; i++)
		quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
				   sb->s_dev, msgs->m[i].msg);
}

static int bch2_quota_check_limit(struct bch_fs *c,
				  unsigned qtype,
				  struct bch_memquota *mq,
				  struct quota_msgs *msgs,
				  enum quota_counters counter,
				  s64 v,
				  enum quota_acct_mode mode)
{
	struct bch_memquota_type *q = &c->quotas[qtype];
	struct memquota_counter *qc = &mq->c[counter];
	u64 n = qc->v + v;

	BUG_ON((s64) n < 0);

	if (mode == KEY_TYPE_QUOTA_NOCHECK)
		return 0;

	if (v <= 0) {
		if (n < qc->hardlimit &&
		    (qc->warning_issued & (1 << HARDWARN))) {
			qc->warning_issued &= ~(1 << HARDWARN);
			prepare_msg(qtype, counter, msgs, HARDBELOW);
		}

		if (n < qc->softlimit &&
		    (qc->warning_issued & (1 << SOFTWARN))) {
			qc->warning_issued &= ~(1 << SOFTWARN);
			prepare_msg(qtype, counter, msgs, SOFTBELOW);
		}

		qc->warning_issued = 0;
		return 0;
	}

	if (qc->hardlimit &&
	    qc->hardlimit < n &&
	    !ignore_hardlimit(q)) {
		prepare_warning(qc, qtype, counter, msgs, HARDWARN);
		return -EDQUOT;
	}

	if (qc->softlimit &&
	    qc->softlimit < n) {
		if (qc->timer == 0) {
			qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
			prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
		} else if (ktime_get_real_seconds() >= qc->timer &&
			   !ignore_hardlimit(q)) {
			prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
			return -EDQUOT;
		}
	}

	return 0;
}

int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
		    enum quota_counters counter, s64 v,
		    enum quota_acct_mode mode)
{
	unsigned qtypes = enabled_qtypes(c);
	struct bch_memquota_type *q;
	struct bch_memquota *mq[QTYP_NR];
	struct quota_msgs msgs;
	unsigned i;
	int ret = 0;

	memset(&msgs, 0, sizeof(msgs));

	for_each_set_qtype(c, i, q, qtypes) {
		mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
		if (!mq[i])
			return -ENOMEM;
	}

	for_each_set_qtype(c, i, q, qtypes)
		mutex_lock_nested(&q->lock, i);

	for_each_set_qtype(c, i, q, qtypes) {
		ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
		if (ret)
			goto err;
	}

	for_each_set_qtype(c, i, q, qtypes)
		mq[i]->c[counter].v += v;
err:
	for_each_set_qtype(c, i, q, qtypes)
		mutex_unlock(&q->lock);

	flush_warnings(qid, c->vfs_sb, &msgs);
	return ret;
}

static void __bch2_quota_transfer(struct bch_memquota *src_q,
				  struct bch_memquota *dst_q,
				  enum quota_counters counter, s64 v)
{
	BUG_ON(v > src_q->c[counter].v);
	BUG_ON(v + dst_q->c[counter].v < v);

	src_q->c[counter].v -= v;
	dst_q->c[counter].v += v;
}

int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
			struct bch_qid dst,
			struct bch_qid src, u64 space,
			enum quota_acct_mode mode)
{
	struct bch_memquota_type *q;
	struct bch_memquota *src_q[3], *dst_q[3];
	struct quota_msgs msgs;
	unsigned i;
	int ret = 0;

	qtypes &= enabled_qtypes(c);

	memset(&msgs, 0, sizeof(msgs));

	for_each_set_qtype(c, i, q, qtypes) {
		src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL);
		dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL);
		if (!src_q[i] || !dst_q[i])
			return -ENOMEM;
	}

	for_each_set_qtype(c, i, q, qtypes)
		mutex_lock_nested(&q->lock, i);

	for_each_set_qtype(c, i, q, qtypes) {
		ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
					     dst_q[i]->c[Q_SPC].v + space,
					     mode);
		if (ret)
			goto err;

		ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
					     dst_q[i]->c[Q_INO].v + 1,
					     mode);
		if (ret)
			goto err;
	}

	for_each_set_qtype(c, i, q, qtypes) {
		__bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
		__bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
	}

err:
	for_each_set_qtype(c, i, q, qtypes)
		mutex_unlock(&q->lock);

	flush_warnings(dst, c->vfs_sb, &msgs);
	return ret;
}

static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
			    struct qc_dqblk *qdq)
{
	struct bkey_s_c_quota dq;
	struct bch_memquota_type *q;
	struct bch_memquota *mq;
	unsigned i;

	BUG_ON(k.k->p.inode >= QTYP_NR);

	if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
		return 0;

	switch (k.k->type) {
	case KEY_TYPE_quota:
		dq = bkey_s_c_to_quota(k);
		q = &c->quotas[k.k->p.inode];

		mutex_lock(&q->lock);
		mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
		if (!mq) {
			mutex_unlock(&q->lock);
			return -ENOMEM;
		}

		for (i = 0; i < Q_COUNTERS; i++) {
			mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
			mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
		}

		if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
			mq->c[Q_SPC].timer	= qdq->d_spc_timer;
		if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
			mq->c[Q_SPC].warns	= qdq->d_spc_warns;
		if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
			mq->c[Q_INO].timer	= qdq->d_ino_timer;
		if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
			mq->c[Q_INO].warns	= qdq->d_ino_warns;

		mutex_unlock(&q->lock);
	}

	return 0;
}

void bch2_fs_quota_exit(struct bch_fs *c)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
		genradix_free(&c->quotas[i].table);
}

void bch2_fs_quota_init(struct bch_fs *c)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
		mutex_init(&c->quotas[i].lock);
}

static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
{
	struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);

	if (sb_quota)
		return sb_quota;

	sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
	if (sb_quota) {
		unsigned qtype, qc;

		for (qtype = 0; qtype < QTYP_NR; qtype++)
			for (qc = 0; qc < Q_COUNTERS; qc++)
				sb_quota->q[qtype].c[qc].timelimit =
					cpu_to_le32(7 * 24 * 60 * 60);
	}

	return sb_quota;
}

static void bch2_sb_quota_read(struct bch_fs *c)
{
	struct bch_sb_field_quota *sb_quota;
	unsigned i, j;

	sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
	if (!sb_quota)
		return;

	for (i = 0; i < QTYP_NR; i++) {
		struct bch_memquota_type *q = &c->quotas[i];

		for (j = 0; j < Q_COUNTERS; j++) {
			q->limits[j].timelimit =
				le32_to_cpu(sb_quota->q[i].c[j].timelimit);
			q->limits[j].warnlimit =
				le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
		}
	}
}

static int bch2_fs_quota_read_inode(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked u;
	struct bch_snapshot_tree s_t;
	u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot);

	int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"%s: snapshot tree %u not found", __func__, tree);
	if (ret)
		return ret;

	if (!s_t.master_subvol)
		goto advance;

	ret = bch2_inode_find_by_inum_nowarn_trans(trans,
						   (subvol_inum) {
							le32_to_cpu(s_t.master_subvol),
							k.k->p.offset,
						   }, &u);
	/*
	 * Inode might be deleted in this snapshot - the easiest way to handle
	 * that is to just skip it here:
	 */
	if (bch2_err_matches(ret, ENOENT))
		goto advance;

	if (ret)
		return ret;

	bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
			KEY_TYPE_QUOTA_NOCHECK);
	bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
			KEY_TYPE_QUOTA_NOCHECK);
advance:
	bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
	return 0;
}

int bch2_fs_quota_read(struct bch_fs *c)
{
	mutex_lock(&c->sb_lock);
	struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
	if (!sb_quota) {
		mutex_unlock(&c->sb_lock);
		return -BCH_ERR_ENOSPC_sb_quota;
	}

	bch2_sb_quota_read(c);
	mutex_unlock(&c->sb_lock);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
				   BTREE_ITER_prefetch, k,
			__bch2_quota_set(c, k, NULL)) ?:
		for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
				   BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			bch2_fs_quota_read_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/* Enable/disable/delete quotas for an entire filesystem: */

static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
{
	struct bch_fs *c = sb->s_fs_info;
	struct bch_sb_field_quota *sb_quota;
	int ret = 0;

	if (sb->s_flags & SB_RDONLY)
		return -EROFS;

	/* Accounting must be enabled at mount time: */
	if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
		return -EINVAL;

	/* Can't enable enforcement without accounting: */
	if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
		return -EINVAL;

	if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
		return -EINVAL;

	if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
		return -EINVAL;

	mutex_lock(&c->sb_lock);
	sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
	if (!sb_quota) {
		ret = -BCH_ERR_ENOSPC_sb_quota;
		goto unlock;
	}

	if (uflags & FS_QUOTA_UDQ_ENFD)
		SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);

	if (uflags & FS_QUOTA_GDQ_ENFD)
		SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);

	if (uflags & FS_QUOTA_PDQ_ENFD)
		SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);

	bch2_write_super(c);
unlock:
	mutex_unlock(&c->sb_lock);

	return bch2_err_class(ret);
}

static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
{
	struct bch_fs *c = sb->s_fs_info;

	if (sb->s_flags & SB_RDONLY)
		return -EROFS;

	mutex_lock(&c->sb_lock);
	if (uflags & FS_QUOTA_UDQ_ENFD)
		SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);

	if (uflags & FS_QUOTA_GDQ_ENFD)
		SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);

	if (uflags & FS_QUOTA_PDQ_ENFD)
		SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return 0;
}

static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
{
	struct bch_fs *c = sb->s_fs_info;
	int ret;

	if (sb->s_flags & SB_RDONLY)
		return -EROFS;

	if (uflags & FS_USER_QUOTA) {
		if (c->opts.usrquota)
			return -EINVAL;

		ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
					      POS(QTYP_USR, 0),
					      POS(QTYP_USR, U64_MAX),
					      0, NULL);
		if (ret)
			return ret;
	}

	if (uflags & FS_GROUP_QUOTA) {
		if (c->opts.grpquota)
			return -EINVAL;

		ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
					      POS(QTYP_GRP, 0),
					      POS(QTYP_GRP, U64_MAX),
					      0, NULL);
		if (ret)
			return ret;
	}

	if (uflags & FS_PROJ_QUOTA) {
		if (c->opts.prjquota)
			return -EINVAL;

		ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
					      POS(QTYP_PRJ, 0),
					      POS(QTYP_PRJ, U64_MAX),
					      0, NULL);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Return quota status information, such as enforcements, quota file inode
 * numbers etc.
 */
static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
{
	struct bch_fs *c = sb->s_fs_info;
	unsigned qtypes = enabled_qtypes(c);
	unsigned i;

	memset(state, 0, sizeof(*state));

	for (i = 0; i < QTYP_NR; i++) {
		state->s_state[i].flags |= QCI_SYSFILE;

		if (!(qtypes & (1 << i)))
			continue;

		state->s_state[i].flags |= QCI_ACCT_ENABLED;

		state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
		state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;

		state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
		state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
	}

	return 0;
}

/*
 * Adjust quota timers & warnings
 */
static int bch2_quota_set_info(struct super_block *sb, int type,
			       struct qc_info *info)
{
	struct bch_fs *c = sb->s_fs_info;
	struct bch_sb_field_quota *sb_quota;
	int ret = 0;

	if (0) {
		struct printbuf buf = PRINTBUF;

		qc_info_to_text(&buf, info);
		pr_info("setting:\n%s", buf.buf);
		printbuf_exit(&buf);
	}

	if (sb->s_flags & SB_RDONLY)
		return -EROFS;

	if (type >= QTYP_NR)
		return -EINVAL;

	if (!((1 << type) & enabled_qtypes(c)))
		return -ESRCH;

	if (info->i_fieldmask &
	    ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
		return -EINVAL;

	mutex_lock(&c->sb_lock);
	sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
	if (!sb_quota) {
		ret = -BCH_ERR_ENOSPC_sb_quota;
		goto unlock;
	}

	if (info->i_fieldmask & QC_SPC_TIMER)
		sb_quota->q[type].c[Q_SPC].timelimit =
			cpu_to_le32(info->i_spc_timelimit);

	if (info->i_fieldmask & QC_SPC_WARNS)
		sb_quota->q[type].c[Q_SPC].warnlimit =
			cpu_to_le32(info->i_spc_warnlimit);

	if (info->i_fieldmask & QC_INO_TIMER)
		sb_quota->q[type].c[Q_INO].timelimit =
			cpu_to_le32(info->i_ino_timelimit);

	if (info->i_fieldmask & QC_INO_WARNS)
		sb_quota->q[type].c[Q_INO].warnlimit =
			cpu_to_le32(info->i_ino_warnlimit);

	bch2_sb_quota_read(c);

	bch2_write_super(c);
unlock:
	mutex_unlock(&c->sb_lock);

	return bch2_err_class(ret);
}

/* Get/set individual quotas: */

static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
{
	dst->d_space		= src->c[Q_SPC].v << 9;
	dst->d_spc_hardlimit	= src->c[Q_SPC].hardlimit << 9;
	dst->d_spc_softlimit	= src->c[Q_SPC].softlimit << 9;
	dst->d_spc_timer	= src->c[Q_SPC].timer;
	dst->d_spc_warns	= src->c[Q_SPC].warns;

	dst->d_ino_count	= src->c[Q_INO].v;
	dst->d_ino_hardlimit	= src->c[Q_INO].hardlimit;
	dst->d_ino_softlimit	= src->c[Q_INO].softlimit;
	dst->d_ino_timer	= src->c[Q_INO].timer;
	dst->d_ino_warns	= src->c[Q_INO].warns;
}

static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
			  struct qc_dqblk *qdq)
{
	struct bch_fs *c		= sb->s_fs_info;
	struct bch_memquota_type *q	= &c->quotas[kqid.type];
	qid_t qid			= from_kqid(&init_user_ns, kqid);
	struct bch_memquota *mq;

	memset(qdq, 0, sizeof(*qdq));

	mutex_lock(&q->lock);
	mq = genradix_ptr(&q->table, qid);
	if (mq)
		__bch2_quota_get(qdq, mq);
	mutex_unlock(&q->lock);

	return 0;
}

static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
			       struct qc_dqblk *qdq)
{
	struct bch_fs *c		= sb->s_fs_info;
	struct bch_memquota_type *q	= &c->quotas[kqid->type];
	qid_t qid			= from_kqid(&init_user_ns, *kqid);
	struct genradix_iter iter;
	struct bch_memquota *mq;
	int ret = 0;

	mutex_lock(&q->lock);

	genradix_for_each_from(&q->table, iter, mq, qid)
		if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
			__bch2_quota_get(qdq, mq);
			*kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
			goto found;
		}

	ret = -ENOENT;
found:
	mutex_unlock(&q->lock);
	return bch2_err_class(ret);
}

static int bch2_set_quota_trans(struct btree_trans *trans,
				struct bkey_i_quota *new_quota,
				struct qc_dqblk *qdq)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	ret = bkey_err(k);
	if (unlikely(ret))
		return ret;

	if (k.k->type == KEY_TYPE_quota)
		new_quota->v = *bkey_s_c_to_quota(k).v;

	if (qdq->d_fieldmask & QC_SPC_SOFT)
		new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
	if (qdq->d_fieldmask & QC_SPC_HARD)
		new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);

	if (qdq->d_fieldmask & QC_INO_SOFT)
		new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
	if (qdq->d_fieldmask & QC_INO_HARD)
		new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);

	ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int bch2_set_quota(struct super_block *sb, struct kqid qid,
			  struct qc_dqblk *qdq)
{
	struct bch_fs *c = sb->s_fs_info;
	struct bkey_i_quota new_quota;
	int ret;

	if (0) {
		struct printbuf buf = PRINTBUF;

		qc_dqblk_to_text(&buf, qdq);
		pr_info("setting:\n%s", buf.buf);
		printbuf_exit(&buf);
	}

	if (sb->s_flags & SB_RDONLY)
		return -EROFS;

	bkey_quota_init(&new_quota.k_i);
	new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));

	ret = bch2_trans_commit_do(c, NULL, NULL, 0,
				   bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
		__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);

	return bch2_err_class(ret);
}

const struct quotactl_ops bch2_quotactl_operations = {
	.quota_enable		= bch2_quota_enable,
	.quota_disable		= bch2_quota_disable,
	.rm_xquota		= bch2_quota_remove,

	.get_state		= bch2_quota_get_state,
	.set_info		= bch2_quota_set_info,

	.get_dqblk		= bch2_get_quota,
	.get_nextdqblk		= bch2_get_next_quota,
	.set_dqblk		= bch2_set_quota,
};

#endif /* CONFIG_BCACHEFS_QUOTA */
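The bch2_quotactl_operations table above is what the generic quotactl(2) path dispatches to: Q_GETQUOTA lands in bch2_get_quota() and Q_SETQUOTA in bch2_set_quota(), with usage and limits traveling in the kernel's qc_dqblk. A minimal userspace sketch follows, assuming a bcachefs filesystem on /dev/sda1 mounted with usrquota and an arbitrary uid of 1000 (both placeholders); note that __bch2_quota_get() reports d_space in bytes (sectors << 9), which the quota layer converts to the 1 KiB block units of struct dqblk.

/*
 * Sketch: query a user's quota on a bcachefs mount via quotactl(2).
 * "/dev/sda1" and uid 1000 are illustrative placeholders.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/quota.h>

int main(void)
{
	struct dqblk dq;

	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), "/dev/sda1", 1000,
		     (caddr_t)&dq) != 0) {
		perror("quotactl");
		return EXIT_FAILURE;
	}

	/* dqb_curspace is bytes; the limits are in 1 KiB blocks */
	printf("used %llu bytes, soft limit %llu blocks, hard limit %llu blocks\n",
	       (unsigned long long)dq.dqb_curspace,
	       (unsigned long long)dq.dqb_bsoftlimit,
	       (unsigned long long)dq.dqb_bhardlimit);
	return 0;
}

Exceeding the soft limit starts the per-counter grace timer seen in bch2_quota_check_limit(); only after the timer expires (or the hard limit is hit) do writes start failing with -EDQUOT.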
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FS_NOTIFY_FSNOTIFY_H_
#define __FS_NOTIFY_FSNOTIFY_H_

#include <linux/list.h>
#include <linux/fsnotify.h>
#include <linux/srcu.h>
#include <linux/types.h>

#include "../mount.h"

/*
 * fsnotify_connp_t is what we embed in objects which connector can be attached
 * to.
 */
typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;

static inline struct inode *fsnotify_conn_inode(
				struct fsnotify_mark_connector *conn)
{
	return conn->obj;
}

static inline struct mount *fsnotify_conn_mount(
				struct fsnotify_mark_connector *conn)
{
	return real_mount(conn->obj);
}

static inline struct super_block *fsnotify_conn_sb(
				struct fsnotify_mark_connector *conn)
{
	return conn->obj;
}

static inline struct super_block *fsnotify_object_sb(void *obj,
			enum fsnotify_obj_type obj_type)
{
	switch (obj_type) {
	case FSNOTIFY_OBJ_TYPE_INODE:
		return ((struct inode *)obj)->i_sb;
	case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
		return ((struct vfsmount *)obj)->mnt_sb;
	case FSNOTIFY_OBJ_TYPE_SB:
		return (struct super_block *)obj;
	default:
		return NULL;
	}
}

static inline struct super_block *fsnotify_connector_sb(
				struct fsnotify_mark_connector *conn)
{
	return fsnotify_object_sb(conn->obj, conn->type);
}

static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb)
{
	struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);

	return sbinfo ? &sbinfo->sb_marks : NULL;
}

/* destroy all events sitting in this group's notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);

/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;

/* compare two groups for sorting of marks lists */
extern int fsnotify_compare_groups(struct fsnotify_group *a,
				   struct fsnotify_group *b);

/* Destroy all marks attached to an object via connector */
extern void fsnotify_destroy_marks(fsnotify_connp_t *connp);
/* run the list of all marks associated with inode and destroy them */
static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
{
	fsnotify_destroy_marks(&inode->i_fsnotify_marks);
}
/* run the list of all marks associated with vfsmount and destroy them */
static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
	fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
}
/* run the list of all marks associated with sb and destroy them */
static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
{
	fsnotify_destroy_marks(fsnotify_sb_marks(sb));
}

/*
 * update the dentry->d_flags of all of inode's children to indicate if inode
 * cares about events that happen to its children.
 */
extern void fsnotify_set_children_dentry_flags(struct inode *inode);

extern struct kmem_cache *fsnotify_mark_connector_cachep;

#endif	/* __FS_NOTIFY_FSNOTIFY_H_ */
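The connector and mark machinery declared in this header is what ultimately backs the inotify and fanotify syscalls: each watch attaches a mark to the watched object's connector list (inode, vfsmount, or super_block, resolved by fsnotify_object_sb() above). A minimal userspace sketch that exercises an inode mark follows; the watched path "/tmp" is an arbitrary choice.

/* Sketch: one inotify watch == one mark on the inode's connector list. */
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	int fd = inotify_init1(IN_CLOEXEC);

	if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE) < 0) {
		perror("inotify");
		return 1;
	}

	char buf[4096];
	ssize_t len = read(fd, buf, sizeof(buf));	/* blocks until an event */

	printf("read %zd bytes of struct inotify_event data\n", len);
	close(fd);
	return 0;
}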
// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 */
#ifndef _LINUX_HUGETLB_VMEMMAP_H
#define _LINUX_HUGETLB_VMEMMAP_H
#include <linux/hugetlb.h>

/*
 * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
 * Documentation/mm/vmemmap_dedup.rst.
 */
#define HUGETLB_VMEMMAP_RESERVE_SIZE	PAGE_SIZE
#define HUGETLB_VMEMMAP_RESERVE_PAGES	(HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page))

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio);
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios);
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);

static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
{
	return pages_per_huge_page(h) * sizeof(struct page);
}

/*
 * Return how much of the vmemmap associated with a HugeTLB page can be
 * optimized away and freed back to the buddy allocator.
 */
static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
	int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE;

	if (!is_power_of_2(sizeof(struct page)))
		return 0;

	return size > 0 ? size : 0;
}
#else
static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return 0;
}

static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h,
					struct list_head *folio_list,
					struct list_head *non_hvo_folios)
{
	list_splice_init(folio_list, non_hvo_folios);
	return 0;
}

static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
}

static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
}

static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
	return 0;
}
#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */

static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
{
	return hugetlb_vmemmap_optimizable_size(h) != 0;
}

#endif /* _LINUX_HUGETLB_VMEMMAP_H */
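A quick worked example of the arithmetic in hugetlb_vmemmap_optimizable_size(): the sketch below assumes PAGE_SIZE == 4096 and sizeof(struct page) == 64 (the common x86-64 values; both are assumptions, which is also why the header checks is_power_of_2()). For a 2 MiB huge page there are 512 base pages, so 512 * 64 = 32 KiB of vmemmap, of which one reserved page (4 KiB) is kept and 28 KiB becomes freeable.

/* Worked example of the HVO savings arithmetic (values assumed, see above). */
#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	const unsigned long struct_page_size = 64;	/* assumed, power of 2 */
	const unsigned long pages_per_huge_page = (2UL << 20) / page_size; /* 512 */

	unsigned long vmemmap = pages_per_huge_page * struct_page_size; /* 32 KiB */
	unsigned long reserve = page_size;	/* HUGETLB_VMEMMAP_RESERVE_SIZE */
	unsigned long freed = vmemmap > reserve ? vmemmap - reserve : 0;

	printf("vmemmap %lu KiB, freed %lu KiB per 2 MiB huge page\n",
	       vmemmap >> 10, freed >> 10);	/* prints: 32 KiB, 28 KiB */
	return 0;
}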
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008 IBM Corporation
 *
 * Author: Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima_api.c
 *	Implements must_appraise_or_measure, collect_measurement,
 *	appraise_measurement, store_measurement and store_template.
 */

#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/xattr.h>
#include <linux/evm.h>
#include <linux/fsverity.h>

#include "ima.h"

/*
 * ima_free_template_entry - free an existing template entry
 */
void ima_free_template_entry(struct ima_template_entry *entry)
{
	int i;

	for (i = 0; i < entry->template_desc->num_fields; i++)
		kfree(entry->template_data[i].data);

	kfree(entry->digests);
	kfree(entry);
}

/*
 * ima_alloc_init_template - create and initialize a new template entry
 */
int ima_alloc_init_template(struct ima_event_data *event_data,
			    struct ima_template_entry **entry,
			    struct ima_template_desc *desc)
{
	struct ima_template_desc *template_desc;
	struct tpm_digest *digests;
	int i, result = 0;

	if (desc)
		template_desc = desc;
	else
		template_desc = ima_template_desc_current();

	*entry = kzalloc(struct_size(*entry, template_data,
				     template_desc->num_fields), GFP_NOFS);
	if (!*entry)
		return -ENOMEM;

	digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots,
			  sizeof(*digests), GFP_NOFS);
	if (!digests) {
		kfree(*entry);
		*entry = NULL;
		return -ENOMEM;
	}

	(*entry)->digests = digests;
	(*entry)->template_desc = template_desc;
	for (i = 0; i < template_desc->num_fields; i++) {
		const struct ima_template_field *field =
			template_desc->fields[i];
		u32 len;

		result = field->field_init(event_data,
					   &((*entry)->template_data[i]));
		if (result != 0)
			goto out;

		len = (*entry)->template_data[i].len;
		(*entry)->template_data_len += sizeof(len);
		(*entry)->template_data_len += len;
	}
	return 0;
out:
	ima_free_template_entry(*entry);
	*entry = NULL;
	return result;
}

/*
 * ima_store_template - store ima template measurements
 *
 * Calculate the hash of a template entry, add the template entry
 * to an ordered list of measurement entries maintained inside the kernel,
 * and also update the aggregate integrity value (maintained inside the
 * configured TPM PCR) over the hashes of the current list of measurement
 * entries.
 *
 * Applications retrieve the current kernel-held measurement list through
 * the securityfs entries in /sys/kernel/security/ima. The signed aggregate
 * TPM PCR (called quote) can be retrieved using a TPM user space library
 * and is used to validate the measurement list.
 *
 * Returns 0 on success, error code otherwise
 */
int ima_store_template(struct ima_template_entry *entry,
		       int violation, struct inode *inode,
		       const unsigned char *filename, int pcr)
{
	static const char op[] = "add_template_measure";
	static const char audit_cause[] = "hashing_error";
	char *template_name = entry->template_desc->name;
	int result;

	if (!violation) {
		result = ima_calc_field_array_hash(&entry->template_data[0],
						   entry);
		if (result < 0) {
			integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode,
					    template_name, op,
					    audit_cause, result, 0);
			return result;
		}
	}
	entry->pcr = pcr;
	result = ima_add_template_entry(entry, violation, op, inode, filename);
	return result;
}

/*
 * ima_add_violation - add violation to measurement list.
 *
 * Violations are flagged in the measurement list with zero hash values.
 * By extending the PCR with 0xFF's instead of with zeroes, the PCR
 * value is invalidated.
 */
void ima_add_violation(struct file *file, const unsigned char *filename,
		       struct ima_iint_cache *iint, const char *op,
		       const char *cause)
{
	struct ima_template_entry *entry;
	struct inode *inode = file_inode(file);
	struct ima_event_data event_data = { .iint = iint,
					     .file = file,
					     .filename = filename,
					     .violation = cause };
	int violation = 1;
	int result;

	/* can overflow, only indicator */
	atomic_long_inc(&ima_htable.violations);

	result = ima_alloc_init_template(&event_data, &entry, NULL);
	if (result < 0) {
		result = -ENOMEM;
		goto err_out;
	}
	result = ima_store_template(entry, violation, inode,
				    filename, CONFIG_IMA_MEASURE_PCR_IDX);
	if (result < 0)
		ima_free_template_entry(entry);
err_out:
	integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename,
			    op, cause, result, 0);
}

/**
 * ima_get_action - appraise & measure decision based on policy.
 * @idmap: idmap of the mount the inode was found from
 * @inode: pointer to the inode associated with the object being validated
 * @cred: pointer to credentials structure to validate
 * @prop: properties of the task being validated
 * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC,
 *        MAY_APPEND)
 * @func: caller identifier
 * @pcr: pointer filled in if matched measure policy sets pcr=
 * @template_desc: pointer filled in if matched measure policy sets template=
 * @func_data: func specific data, may be NULL
 * @allowed_algos: allowlist of hash algorithms for the IMA xattr
 *
 * The policy is defined in terms of keypairs:
 *		subj=, obj=, type=, func=, mask=, fsmagic=
 *	subj,obj, and type: are LSM specific.
 *	func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK
 *	| KEXEC_CMDLINE | KEY_CHECK | CRITICAL_DATA | SETXATTR_CHECK
 *	| MMAP_CHECK_REQPROT
 *	mask: contains the permission mask
 *	fsmagic: hex value
 *
 * Returns IMA_MEASURE, IMA_APPRAISE mask.
 *
 */
int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
		   const struct cred *cred, struct lsm_prop *prop, int mask,
		   enum ima_hooks func, int *pcr,
		   struct ima_template_desc **template_desc,
		   const char *func_data, unsigned int *allowed_algos)
{
	int flags = IMA_MEASURE | IMA_AUDIT | IMA_APPRAISE | IMA_HASH;

	flags &= ima_policy_flag;

	return ima_match_policy(idmap, inode, cred, prop, func, mask, flags,
				pcr, template_desc, func_data, allowed_algos);
}

static bool ima_get_verity_digest(struct ima_iint_cache *iint,
				  struct inode *inode,
				  struct ima_max_digest_data *hash)
{
	enum hash_algo alg;
	int digest_len;

	/*
	 * On failure, 'measure' policy rules will result in a file data
	 * hash containing 0's.
	 */
	digest_len = fsverity_get_digest(inode, hash->digest, NULL, &alg);
	if (digest_len == 0)
		return false;

	/*
	 * Unlike in the case of actually calculating the file hash, in
	 * the fsverity case regardless of the hash algorithm, return
	 * the verity digest to be included in the measurement list. A
	 * mismatch between the verity algorithm and the xattr signature
	 * algorithm, if one exists, will be detected later.
	 */
	hash->hdr.algo = alg;
	hash->hdr.length = digest_len;
	return true;
}

/*
 * ima_collect_measurement - collect file measurement
 *
 * Calculate the file hash, if it doesn't already exist,
 * storing the measurement and i_version in the iint.
 *
 * Must be called with iint->mutex held.
 *
 * Return 0 on success, error code otherwise
 */
int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file,
			    void *buf, loff_t size, enum hash_algo algo,
			    struct modsig *modsig)
{
	const char *audit_cause = "failed";
	struct inode *inode = file_inode(file);
	struct inode *real_inode = d_real_inode(file_dentry(file));
	struct ima_max_digest_data hash;
	struct ima_digest_data *hash_hdr = container_of(&hash.hdr,
						struct ima_digest_data, hdr);
	struct name_snapshot filename;
	struct kstat stat;
	int result = 0;
	int length;
	void *tmpbuf;
	u64 i_version = 0;

	/*
	 * Always collect the modsig, because IMA might have already collected
	 * the file digest without collecting the modsig in a previous
	 * measurement rule.
	 */
	if (modsig)
		ima_collect_modsig(modsig, buf, size);

	if (iint->flags & IMA_COLLECTED)
		goto out;

	/*
	 * Detecting file change is based on i_version. On filesystems
	 * which do not support i_version, support was originally limited
	 * to an initial measurement/appraisal/audit, but was modified to
	 * assume the file changed.
	 */
	result = vfs_getattr_nosec(&file->f_path, &stat, STATX_CHANGE_COOKIE,
				   AT_STATX_SYNC_AS_STAT);
	if (!result && (stat.result_mask & STATX_CHANGE_COOKIE))
		i_version = stat.change_cookie;

	hash.hdr.algo = algo;
	hash.hdr.length = hash_digest_size[algo];

	/* Initialize hash digest to 0's in case of failure */
	memset(&hash.digest, 0, sizeof(hash.digest));

	if (iint->flags & IMA_VERITY_REQUIRED) {
		if (!ima_get_verity_digest(iint, inode, &hash)) {
			audit_cause = "no-verity-digest";
			result = -ENODATA;
		}
	} else if (buf) {
		result = ima_calc_buffer_hash(buf, size, hash_hdr);
	} else {
		result = ima_calc_file_hash(file, hash_hdr);
	}

	if (result && result != -EBADF && result != -EINVAL)
		goto out;

	length = sizeof(hash.hdr) + hash.hdr.length;
	tmpbuf = krealloc(iint->ima_hash, length, GFP_NOFS);
	if (!tmpbuf) {
		result = -ENOMEM;
		goto out;
	}

	iint->ima_hash = tmpbuf;
	memcpy(iint->ima_hash, &hash, length);
	if (real_inode == inode)
		iint->real_inode.version = i_version;
	else
		integrity_inode_attrs_store(&iint->real_inode, i_version,
					    real_inode);

	/* Possibly temporary failure due to type of read (eg. O_DIRECT) */
	if (!result)
		iint->flags |= IMA_COLLECTED;
out:
	if (result) {
		if (file->f_flags & O_DIRECT)
			audit_cause = "failed(directio)";

		take_dentry_name_snapshot(&filename, file->f_path.dentry);
		integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode,
				    filename.name.name, "collect_data",
				    audit_cause, result, 0);
		release_dentry_name_snapshot(&filename);
	}
	return result;
}

/*
 * ima_store_measurement - store file measurement
 *
 * Create an "ima" template and then store the template by calling
 * ima_store_template.
 *
 * We only get here if the inode has not already been measured,
 * but the measurement could already exist:
 *	- multiple copies of the same file on either the same or
 *	  different filesystems.
 *	- the inode was previously flushed as well as the iint info,
 *	  containing the hashing info.
 *
 * Must be called with iint->mutex held.
 */
void ima_store_measurement(struct ima_iint_cache *iint, struct file *file,
			   const unsigned char *filename,
			   struct evm_ima_xattr_data *xattr_value,
			   int xattr_len, const struct modsig *modsig, int pcr,
			   struct ima_template_desc *template_desc)
{
	static const char op[] = "add_template_measure";
	static const char audit_cause[] = "ENOMEM";
	int result = -ENOMEM;
	struct inode *inode = file_inode(file);
	struct ima_template_entry *entry;
	struct ima_event_data event_data = { .iint = iint,
					     .file = file,
					     .filename = filename,
					     .xattr_value = xattr_value,
					     .xattr_len = xattr_len,
					     .modsig = modsig };
	int violation = 0;

	/*
	 * We still need to store the measurement in the case of MODSIG because
	 * we only have its contents to put in the list at the time of
	 * appraisal, but a file measurement from earlier might already exist in
	 * the measurement list.
	 */
	if (iint->measured_pcrs & (0x1 << pcr) && !modsig)
		return;

	result = ima_alloc_init_template(&event_data, &entry, template_desc);
	if (result < 0) {
		integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename,
				    op, audit_cause, result, 0);
		return;
	}

	result = ima_store_template(entry, violation, inode, filename, pcr);
	if ((!result || result == -EEXIST) && !(file->f_flags & O_DIRECT)) {
		iint->flags |= IMA_MEASURED;
		iint->measured_pcrs |= (0x1 << pcr);
	}
	if (result < 0)
		ima_free_template_entry(entry);
}

void ima_audit_measurement(struct ima_iint_cache *iint,
			   const unsigned char *filename)
{
	struct audit_buffer *ab;
	char *hash;
	const char *algo_name = hash_algo_name[iint->ima_hash->algo];
	int i;

	if (iint->flags & IMA_AUDITED)
		return;

	hash = kzalloc((iint->ima_hash->length * 2) + 1, GFP_KERNEL);
	if (!hash)
		return;

	for (i = 0; i < iint->ima_hash->length; i++)
		hex_byte_pack(hash + (i * 2), iint->ima_hash->digest[i]);
	hash[i * 2] = '\0';

	ab = audit_log_start(audit_context(), GFP_KERNEL,
			     AUDIT_INTEGRITY_RULE);
	if (!ab)
		goto out;

	audit_log_format(ab, "file=");
	audit_log_untrustedstring(ab, filename);
	audit_log_format(ab, " hash=\"%s:%s\"", algo_name, hash);

	audit_log_task_info(ab);
	audit_log_end(ab);

	iint->flags |= IMA_AUDITED;
out:
	kfree(hash);
	return;
}

/*
 * ima_d_path - return a pointer to the full pathname
 *
 * Attempt to return a pointer to the full pathname for use in the
 * IMA measurement list, IMA audit records, and auditing logs.
 *
 * On failure, return a pointer to a copy of the filename, not dname.
 * Returning a pointer to dname, could result in using the pointer
 * after the memory has been freed.
 */
const char *ima_d_path(const struct path *path, char **pathbuf, char *namebuf)
{
	struct name_snapshot filename;
	char *pathname = NULL;

	*pathbuf = __getname();
	if (*pathbuf) {
		pathname = d_absolute_path(path, *pathbuf, PATH_MAX);
		if (IS_ERR(pathname)) {
			__putname(*pathbuf);
			*pathbuf = NULL;
			pathname = NULL;
		}
	}

	if (!pathname) {
		take_dentry_name_snapshot(&filename, path->dentry);
		strscpy(namebuf, filename.name.name, NAME_MAX);
		release_dentry_name_snapshot(&filename);

		pathname = namebuf;
	}

	return pathname;
}
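As the ima_store_template() comment above notes, the measurement list these functions build is exported through securityfs. A small userspace sketch that dumps it follows; the path is the standard one, but reading it requires root and a kernel with CONFIG_IMA enabled.

/* Sketch: dump the IMA measurement list that ima_store_template() feeds. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/security/ima/ascii_runtime_measurements",
			"r");
	char line[1024];

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Each line: PCR, template hash, template name, template fields */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}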
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/mpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains functions related to preparing and submitting BIOs which contain
 * multiple pagecache pages.
 *
 * 15May2002	Andrew Morton
 *		Initial version
 * 27Jun2002	axboe@suse.de
 *		use bio_add_page() to build bio's just the right size
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include "internal.h"

/*
 * I/O completion handler for multipage BIOs.
 *
 * The mpage code never puts partial pages into a BIO (except for end-of-file).
 * If a page does not map to a contiguous run of blocks then it simply falls
 * back to block_read_full_folio().
 *
 * Why is this?  If a page's completion depends on a number of different BIOs
 * which can complete in any order (or at the same time) then determining the
 * status of that page is hard.  See end_buffer_async_read() for the details.
 * There is no point in duplicating all that complexity.
 */
static void mpage_read_end_io(struct bio *bio)
{
	struct folio_iter fi;
	int err = blk_status_to_errno(bio->bi_status);

	bio_for_each_folio_all(fi, bio)
		folio_end_read(fi.folio, err == 0);

	bio_put(bio);
}

static void mpage_write_end_io(struct bio *bio)
{
	struct folio_iter fi;
	int err = blk_status_to_errno(bio->bi_status);

	bio_for_each_folio_all(fi, bio) {
		if (err)
			mapping_set_error(fi.folio->mapping, err);
		folio_end_writeback(fi.folio);
	}

	bio_put(bio);
}

static struct bio *mpage_bio_submit_read(struct bio *bio)
{
	bio->bi_end_io = mpage_read_end_io;
	guard_bio_eod(bio);
	submit_bio(bio);
	return NULL;
}

static struct bio *mpage_bio_submit_write(struct bio *bio)
{
	bio->bi_end_io = mpage_write_end_io;
	guard_bio_eod(bio);
	submit_bio(bio);
	return NULL;
}

/*
 * support function for mpage_readahead.  The fs supplied get_block might
 * return an up to date buffer.  This is used to map that buffer into
 * the page, which allows read_folio to avoid triggering a duplicate call
 * to get_block.
 *
 * The idea is to avoid adding buffers to pages that don't already have
 * them.  So when the buffer is up to date and the page size == block size,
 * this marks the page up to date instead of adding new buffers.
 */
static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
		int page_block)
{
	struct inode *inode = folio->mapping->host;
	struct buffer_head *page_bh, *head;
	int block = 0;

	head = folio_buffers(folio);
	if (!head) {
		/*
		 * don't make any buffers if there is only one buffer on
		 * the folio and the folio just needs to be set up to date
		 */
		if (inode->i_blkbits == PAGE_SHIFT &&
		    buffer_uptodate(bh)) {
			folio_mark_uptodate(folio);
			return;
		}
		head = create_empty_buffers(folio, i_blocksize(inode), 0);
	}

	page_bh = head;
	do {
		if (block == page_block) {
			page_bh->b_state = bh->b_state;
			page_bh->b_bdev = bh->b_bdev;
			page_bh->b_blocknr = bh->b_blocknr;
			break;
		}
		page_bh = page_bh->b_this_page;
		block++;
	} while (page_bh != head);
}

struct mpage_readpage_args {
	struct bio *bio;
	struct folio *folio;
	unsigned int nr_pages;
	bool is_readahead;
	sector_t last_block_in_bio;
	struct buffer_head map_bh;
	unsigned long first_logical_block;
	get_block_t *get_block;
};

/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructs largest possible bios, submits them for IO if the
 * blocks are not contiguous on the disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 */
static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
{
	struct folio *folio = args->folio;
	struct inode *inode = folio->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head *map_bh = &args->map_bh;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t first_block;
	unsigned page_block;
	unsigned first_hole = blocks_per_page;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	blk_opf_t opf = REQ_OP_READ;
	unsigned nblocks;
	unsigned relative_block;
	gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);

	/* MAX_BUF_PER_PAGE, for example */
	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

	if (args->is_readahead) {
		opf |= REQ_RAHEAD;
		gfp |= __GFP_NORETRY | __GFP_NOWARN;
	}

	if (folio_buffers(folio))
		goto confused;

	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
	last_block = block_in_file + args->nr_pages * blocks_per_page;
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;

	/*
	 * Map blocks using the result from the previous get_blocks call first.
	 */
	nblocks = map_bh->b_size >> blkbits;
	if (buffer_mapped(map_bh) &&
			block_in_file > args->first_logical_block &&
			block_in_file < (args->first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - args->first_logical_block;
		unsigned last = nblocks - map_offset;

		first_block = map_bh->b_blocknr + map_offset;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);
				break;
			}
			if (page_block == blocks_per_page)
				break;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/*
	 * Then do more get_blocks calls until we are done with this folio.
	 */
	map_bh->b_folio = folio;
	while (page_block < blocks_per_page) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;
			if (args->get_block(inode, block_in_file, map_bh, 0))
				goto confused;
			args->first_logical_block = block_in_file;
		}

		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_page)
				first_hole = page_block;
			page_block++;
			block_in_file++;
			continue;
		}

		/* some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again.  map_buffer_to_folio copies the data
		 * we just collected from get_block into the folio's buffers
		 * so read_folio doesn't have to repeat the get_block call
		 */
		if (buffer_uptodate(map_bh)) {
			map_buffer_to_folio(folio, map_bh, page_block);
			goto confused;
		}

		if (first_hole != blocks_per_page)
			goto confused;		/* hole -> non-hole */

		/* Contiguous blocks? */
		if (!page_block)
			first_block = map_bh->b_blocknr;
		else if (first_block + page_block != map_bh->b_blocknr)
			goto confused;

		nblocks = map_bh->b_size >> blkbits;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_page)
				break;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	if (first_hole != blocks_per_page) {
		folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
		if (first_hole == 0) {
			folio_mark_uptodate(folio);
			folio_unlock(folio);
			goto out;
		}
	} else if (fully_mapped) {
		folio_set_mappedtodisk(folio);
	}

	/*
	 * This folio will go to BIO.  Do we need to send this BIO off first?
	 */
	if (args->bio && (args->last_block_in_bio != first_block - 1))
		args->bio = mpage_bio_submit_read(args->bio);

alloc_new:
	if (args->bio == NULL) {
		args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf,
				      gfp);
		if (args->bio == NULL)
			goto confused;
		args->bio->bi_iter.bi_sector = first_block << (blkbits - 9);
	}

	length = first_hole << blkbits;
	if (!bio_add_folio(args->bio, folio, length, 0)) {
		args->bio = mpage_bio_submit_read(args->bio);
		goto alloc_new;
	}

	relative_block = block_in_file - args->first_logical_block;
	nblocks = map_bh->b_size >> blkbits;
	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
	    (first_hole != blocks_per_page))
		args->bio = mpage_bio_submit_read(args->bio);
	else
		args->last_block_in_bio = first_block + blocks_per_page - 1;
out:
	return args->bio;

confused:
	if (args->bio)
		args->bio = mpage_bio_submit_read(args->bio);
	if (!folio_test_uptodate(folio))
		block_read_full_folio(folio, args->get_block);
	else
		folio_unlock(folio);
	goto out;
}

/**
 * mpage_readahead - start reads against pages
 * @rac: Describes which pages to read.
 * @get_block: The filesystem's block mapper function.
 *
 * This function walks the pages and the blocks within each page, building and
 * emitting large BIOs.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
 *
 * BH_Boundary explanation:
 *
 * There is a problem.  The mpage read code assembles several pages, gets all
 * their disk mappings, and then submits them all.  That's fine, but obtaining
 * the disk mappings may require I/O.  Reads of indirect blocks, for example.
 *
 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
 * submitted in the following order:
 *
 *	12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
 *
 * because the indirect block has to be read to get the mappings of blocks
 * 13,14,15,16.  Obviously, this impacts performance.
 *
 * So what we do is allow the filesystem's get_block() function to set
 * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
 * after this one will require I/O against a block which is probably close to
 * this one.  So you should push what I/O you have currently accumulated.
 *
 * This all causes the disk requests to be issued in the correct order.
 */
void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
{
	struct folio *folio;
	struct mpage_readpage_args args = {
		.get_block = get_block,
		.is_readahead = true,
	};

	while ((folio = readahead_folio(rac))) {
		prefetchw(&folio->flags);
		args.folio = folio;
		args.nr_pages = readahead_count(rac);
		args.bio = do_mpage_readpage(&args);
	}
	if (args.bio)
		mpage_bio_submit_read(args.bio);
}
EXPORT_SYMBOL(mpage_readahead);

/*
 * This isn't called much at all
 */
int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
	struct mpage_readpage_args args = {
		.folio = folio,
		.nr_pages = 1,
		.get_block = get_block,
	};

	args.bio = do_mpage_readpage(&args);
	if (args.bio)
		mpage_bio_submit_read(args.bio);
	return 0;
}
EXPORT_SYMBOL(mpage_read_folio);

/*
 * Writing is not so simple.
 *
 * If the page has buffers then they will be used for obtaining the disk
 * mapping.  We only support pages which are fully mapped-and-dirty, with a
 * special case for pages which are unmapped at the end: end-of-file.
 *
 * If the page has no buffers (preferred) then the page is mapped here.
 *
 * If all blocks are found to be contiguous then the page can go into the
 * BIO.  Otherwise fall back to the mapping's writepage().
 *
 * FIXME: This code wants an estimate of how many pages are still to be
 * written, so it can intelligently allocate a suitably-sized BIO.  For now,
 * just allocate full-size (16-page) BIOs.
 */

struct mpage_data {
	struct bio *bio;
	sector_t last_block_in_bio;
	get_block_t *get_block;
};

/*
 * We have our BIO, so we can now mark the buffers clean.  Make
 * sure to only clean buffers which we know we'll be writing.
 */
static void clean_buffers(struct folio *folio, unsigned first_unmapped)
{
	unsigned buffer_counter = 0;
	struct buffer_head *bh, *head = folio_buffers(folio);

	if (!head)
		return;
	bh = head;
	do {
		if (buffer_counter++ == first_unmapped)
			break;
		clear_buffer_dirty(bh);
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * we cannot drop the bh if the page is not uptodate or a concurrent
	 * read_folio would fail to serialize with the bh and it would read from
	 * disk before we reach the platter.
	 */
	if (buffer_heads_over_limit && folio_test_uptodate(folio))
		try_to_free_buffers(folio);
}

static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
		      void *data)
{
	struct mpage_data *mpd = data;
	struct bio *bio = mpd->bio;
	struct address_space *mapping = folio->mapping;
	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	sector_t last_block;
	sector_t block_in_file;
	sector_t first_block;
	unsigned page_block;
	unsigned first_unmapped = blocks_per_page;
	struct block_device *bdev = NULL;
	int boundary = 0;
	sector_t boundary_block = 0;
	struct block_device *boundary_bdev = NULL;
	size_t length;
	struct buffer_head map_bh;
	loff_t i_size = i_size_read(inode);
	int ret = 0;
	struct buffer_head *head = folio_buffers(folio);

	if (head) {
		struct buffer_head *bh = head;

		/* If they're all mapped and dirty, do it */
		page_block = 0;
		do {
			BUG_ON(buffer_locked(bh));
			if (!buffer_mapped(bh)) {
				/*
				 * unmapped dirty buffers are created by
				 * block_dirty_folio -> mmapped data
				 */
				if (buffer_dirty(bh))
					goto confused;
				if (first_unmapped == blocks_per_page)
					first_unmapped = page_block;
				continue;
			}

			if (first_unmapped != blocks_per_page)
				goto confused;	/* hole -> non-hole */

			if (!buffer_dirty(bh) || !buffer_uptodate(bh))
				goto confused;
			if (page_block) {
				if (bh->b_blocknr != first_block + page_block)
					goto confused;
			} else {
				first_block = bh->b_blocknr;
			}
			page_block++;
			boundary = buffer_boundary(bh);
			if (boundary) {
				boundary_block = bh->b_blocknr;
				boundary_bdev = bh->b_bdev;
			}
			bdev = bh->b_bdev;
		} while ((bh = bh->b_this_page) != head);

		if (first_unmapped)
			goto page_is_mapped;

		/*
		 * Page has buffers, but they are all unmapped. The page was
		 * created by pagein or read over a hole which was handled by
		 * block_read_full_folio().  If this address_space is also
		 * using mpage_readahead then this can rarely happen.
		 */
		goto confused;
	}

	/*
	 * The page has no buffers: map it to disk
	 */
	BUG_ON(!folio_test_uptodate(folio));
	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
	/*
	 * Whole page beyond EOF? Skip allocating blocks to avoid leaking
	 * space.
	 */
	if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits)
		goto page_is_mapped;
	last_block = (i_size - 1) >> blkbits;
	map_bh.b_folio = folio;
	for (page_block = 0; page_block < blocks_per_page; ) {

		map_bh.b_state = 0;
		map_bh.b_size = 1 << blkbits;
		if (mpd->get_block(inode, block_in_file, &map_bh, 1))
			goto confused;
		if (!buffer_mapped(&map_bh))
			goto confused;
		if (buffer_new(&map_bh))
			clean_bdev_bh_alias(&map_bh);
		if (buffer_boundary(&map_bh)) {
			boundary_block = map_bh.b_blocknr;
			boundary_bdev = map_bh.b_bdev;
		}
		if (page_block) {
			if (map_bh.b_blocknr != first_block + page_block)
				goto confused;
		} else {
			first_block = map_bh.b_blocknr;
		}
		page_block++;
		boundary = buffer_boundary(&map_bh);
		bdev = map_bh.b_bdev;
		if (block_in_file == last_block)
			break;
		block_in_file++;
	}
	BUG_ON(page_block == 0);

	first_unmapped = page_block;

page_is_mapped:
	/* Don't bother writing beyond EOF, truncate will discard the folio */
	if (folio_pos(folio) >= i_size)
		goto confused;
	length = folio_size(folio);
	if (folio_pos(folio) + length > i_size) {
		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining memory
		 * is zeroed when mapped, and writes to that region are not
		 * written out to the file."
		 */
		length = i_size - folio_pos(folio);
		folio_zero_segment(folio, length, folio_size(folio));
	}

	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
	if (bio && mpd->last_block_in_bio != first_block - 1)
		bio = mpage_bio_submit_write(bio);

alloc_new:
	if (bio == NULL) {
		bio = bio_alloc(bdev, BIO_MAX_VECS,
				REQ_OP_WRITE | wbc_to_write_flags(wbc),
				GFP_NOFS);
		bio->bi_iter.bi_sector = first_block << (blkbits - 9);
		wbc_init_bio(wbc, bio);
		bio->bi_write_hint = inode->i_write_hint;
	}

	/*
	 * Must try to add the page before marking the buffer clean or
	 * the confused fail path above (OOM) will be very confused when
	 * it finds all bh marked clean (i.e. it will not write anything)
	 */
	wbc_account_cgroup_owner(wbc, folio, folio_size(folio));
	length = first_unmapped << blkbits;
	if (!bio_add_folio(bio, folio, length, 0)) {
		bio = mpage_bio_submit_write(bio);
		goto alloc_new;
	}

	clean_buffers(folio, first_unmapped);

	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);
	folio_unlock(folio);
	if (boundary || (first_unmapped != blocks_per_page)) {
		bio = mpage_bio_submit_write(bio);
		if (boundary_block) {
			write_boundary_block(boundary_bdev,
					boundary_block, 1 << blkbits);
		}
	} else {
		mpd->last_block_in_bio = first_block + blocks_per_page - 1;
	}
	goto out;

confused:
	if (bio)
		bio = mpage_bio_submit_write(bio);

	/*
	 * The caller has a ref on the inode, so *mapping is stable
	 */
	ret = block_write_full_folio(folio, wbc, mpd->get_block);
	mapping_set_error(mapping, ret);
out:
	mpd->bio = bio;
	return ret;
}

/**
 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @get_block: the filesystem's block mapper function.
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 */
int mpage_writepages(struct address_space *mapping,
		struct writeback_control *wbc, get_block_t get_block)
{
	struct mpage_data mpd = {
		.get_block	= get_block,
	};
	struct blk_plug plug;
	int ret;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
	if (mpd.bio)
		mpage_bio_submit_write(mpd.bio);
	blk_finish_plug(&plug);
	return ret;
}
EXPORT_SYMBOL(mpage_writepages);
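The whole mpage library is parameterized by a single get_block_t callback, so wiring it into a filesystem is mostly glue. A sketch of how a simple block-mapped filesystem might hook these helpers into its address_space_operations follows; "myfs_get_block" and "myfs" are hypothetical stand-ins (ext2 does essentially this with its own block mapper).

/*
 * Sketch: hooking the mpage helpers into a filesystem's aops.
 * myfs_get_block is a hypothetical block mapper with the standard
 * get_block_t signature; a real one would translate iblock to a disk
 * block and fill in bh_result.
 */
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create)
{
	/* map iblock -> disk block; set mapped state and b_size */
	return 0;	/* placeholder */
}

static void myfs_readahead(struct readahead_control *rac)
{
	mpage_readahead(rac, myfs_get_block);
}

static int myfs_read_folio(struct file *file, struct folio *folio)
{
	return mpage_read_folio(folio, myfs_get_block);
}

static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	return mpage_writepages(mapping, wbc, myfs_get_block);
}

const struct address_space_operations myfs_aops = {
	.readahead		= myfs_readahead,
	.read_folio		= myfs_read_folio,
	.writepages		= myfs_writepages,
	.dirty_folio		= block_dirty_folio,
	.invalidate_folio	= block_invalidate_folio,
};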
2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 
2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 
3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * dir.c
 *
 * Creates, reads, walks and deletes directory-nodes
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * Portions of this code from linux/fs/ext3/dir.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * from
 *
 * linux/fs/minix/dir.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/quotaops.h>
#include <linux/sort.h>
#include <linux/iversion.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "blockcheck.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "inode.h"
#include "journal.h"
#include "namei.h"
#include "suballoc.h"
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
#include "ocfs2_trace.h"

#include "buffer_head_io.h"

#define NAMEI_RA_CHUNKS  2
#define NAMEI_RA_BLOCKS  4
#define NAMEI_RA_SIZE    (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)

static int ocfs2_do_extend_dir(struct super_block *sb,
                               handle_t *handle,
                               struct inode *dir,
                               struct buffer_head *parent_fe_bh,
                               struct ocfs2_alloc_context *data_ac,
                               struct ocfs2_alloc_context *meta_ac,
                               struct buffer_head **new_bh);

static int ocfs2_dir_indexed(struct inode *inode);

/*
 * These are distinct checks because future versions of the file system will
 * want to have a trailing dirent structure independent of indexing.
 */
static int ocfs2_supports_dir_trailer(struct inode *dir)
{
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);

        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return 0;

        return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
}

/*
 * "new" here refers to the point at which we're creating a new
 * directory via "mkdir()", but also when we're expanding an inline
 * directory. In either case, we don't yet have the indexing bit set
 * on the directory, so the standard checks will fail when metaecc
 * is turned off. Only directory-initialization type functions should
 * use this then. Everything else wants ocfs2_supports_dir_trailer()
 */
static int ocfs2_new_dir_wants_trailer(struct inode *dir)
{
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);

        return ocfs2_meta_ecc(osb) ||
                ocfs2_supports_indexed_dirs(osb);
}

static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
{
        return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
}

#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))

/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
 * them more consistent?
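 *
 * Either way, the invariant is the same: the trailer occupies the
 * final sizeof(struct ocfs2_dir_block_trailer) bytes of a dirblock.
 * As a purely illustrative example, with 4096 byte blocks the
 * trailer would begin at offset
 * 4096 - sizeof(struct ocfs2_dir_block_trailer), which is exactly
 * the value ocfs2_dir_trailer_blk_off() computes from s_blocksize.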
*/ struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, void *data) { char *p = data; p += blocksize - sizeof(struct ocfs2_dir_block_trailer); return (struct ocfs2_dir_block_trailer *)p; } /* * XXX: This is executed once on every dirent. We should consider optimizing * it. */ static int ocfs2_skip_dir_trailer(struct inode *dir, struct ocfs2_dir_entry *de, unsigned long offset, unsigned long blklen) { unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); if (!ocfs2_supports_dir_trailer(dir)) return 0; if (offset != toff) return 0; return 1; } static void ocfs2_init_dir_trailer(struct inode *inode, struct buffer_head *bh, u16 rec_len) { struct ocfs2_dir_block_trailer *trailer; trailer = ocfs2_trailer_from_bh(bh, inode->i_sb); strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE); trailer->db_compat_rec_len = cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); trailer->db_blkno = cpu_to_le64(bh->b_blocknr); trailer->db_free_rec_len = cpu_to_le16(rec_len); } /* * Link an unindexed block with a dir trailer structure into the index free * list. This function will modify dirdata_bh, but assumes you've already * passed it to the journal. */ static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle, struct buffer_head *dx_root_bh, struct buffer_head *dirdata_bh) { int ret; struct ocfs2_dx_root_block *dx_root; struct ocfs2_dir_block_trailer *trailer; ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; trailer->db_free_next = dx_root->dr_free_blk; dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); ocfs2_journal_dirty(handle, dx_root_bh); out: return ret; } static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res) { return res->dl_prev_leaf_bh == NULL; } void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res) { brelse(res->dl_dx_root_bh); brelse(res->dl_leaf_bh); brelse(res->dl_dx_leaf_bh); brelse(res->dl_prev_leaf_bh); } static int ocfs2_dir_indexed(struct inode *inode) { if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL) return 1; return 0; } static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root) { return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE; } /* * Hashing code adapted from ext3 */ #define DELTA 0x9E3779B9 static void TEA_transform(__u32 buf[4], __u32 const in[]) { __u32 sum = 0; __u32 b0 = buf[0], b1 = buf[1]; __u32 a = in[0], b = in[1], c = in[2], d = in[3]; int n = 16; do { sum += DELTA; b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); } while (--n); buf[0] += b0; buf[1] += b1; } static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) { __u32 pad, val; int i; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; val = pad; if (len > num*4) len = num * 4; for (i = 0; i < len; i++) { if ((i % 4) == 0) val = pad; val = msg[i] + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; num--; } } if (--num >= 0) *buf++ = val; while (--num >= 0) *buf++ = pad; } static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len, struct ocfs2_dx_hinfo *hinfo) { struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); const char *p; __u32 in[8], buf[4]; /* * XXX: Is this really necessary, if the index is never looked * at by readdir? 
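 *
 * (The index is only consulted on name lookups; readdir(2) walks
 * the unindexed dirblocks directly, which is presumably why '.'
 * and '..' can get away with the constant (zero) hash below.)
 *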
Is a hash value of '0' a bad idea? */ if ((len == 1 && !strncmp(".", name, 1)) || (len == 2 && !strncmp("..", name, 2))) { buf[0] = buf[1] = 0; goto out; } #ifdef OCFS2_DEBUG_DX_DIRS /* * This makes it very easy to debug indexing problems. We * should never allow this to be selected without hand editing * this file though. */ buf[0] = buf[1] = len; goto out; #endif memcpy(buf, osb->osb_dx_seed, sizeof(buf)); p = name; while (len > 0) { str2hashbuf(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; } out: hinfo->major_hash = buf[0]; hinfo->minor_hash = buf[1]; } /* * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. */ static int ocfs2_check_dir_entry(struct inode *dir, struct ocfs2_dir_entry *de, struct buffer_head *bh, char *buf, unsigned int size, unsigned long offset) { const char *error_msg = NULL; const int rlen = le16_to_cpu(de->rec_len); const unsigned long next_offset = ((char *) de - buf) + rlen; if (unlikely(rlen < OCFS2_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; else if (unlikely(next_offset > size - OCFS2_DIR_REC_LEN(1)) && next_offset != size) error_msg = "directory entry too close to end"; if (unlikely(error_msg != NULL)) mlog(ML_ERROR, "bad entry in directory #%llu: %s - " "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg, offset, (unsigned long long)le64_to_cpu(de->inode), rlen, de->name_len); return error_msg == NULL ? 1 : 0; } static inline int ocfs2_match(int len, const char * const name, struct ocfs2_dir_entry *de) { if (len != de->name_len) return 0; if (!de->inode) return 0; return !memcmp(name, de->name, len); } /* * Returns 0 if not found, -1 on failure, and 1 on success */ static inline int ocfs2_search_dirblock(struct buffer_head *bh, struct inode *dir, const char *name, int namelen, unsigned long offset, char *first_de, unsigned int bytes, struct ocfs2_dir_entry **res_dir) { struct ocfs2_dir_entry *de; char *dlimit, *de_buf; int de_len; int ret = 0; de_buf = first_de; dlimit = de_buf + bytes; while (de_buf < dlimit - OCFS2_DIR_MEMBER_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ de = (struct ocfs2_dir_entry *) de_buf; if (de->name + namelen <= dlimit && ocfs2_match(namelen, name, de)) { /* found a match - just to be sure, do a full check */ if (!ocfs2_check_dir_entry(dir, de, bh, first_de, bytes, offset)) { ret = -1; goto bail; } *res_dir = de; ret = 1; goto bail; } /* prevent looping on a bad block */ de_len = le16_to_cpu(de->rec_len); if (de_len <= 0) { ret = -1; goto bail; } de_buf += de_len; offset += de_len; } bail: trace_ocfs2_search_dirblock(ret); return ret; } static struct buffer_head *ocfs2_find_entry_id(const char *name, int namelen, struct inode *dir, struct ocfs2_dir_entry **res_dir) { int ret, found; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_inline_data *data; ret = ocfs2_read_inode_block(dir, &di_bh); if (ret) { mlog_errno(ret); goto out; } di = (struct ocfs2_dinode *)di_bh->b_data; data = &di->id2.i_data; found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0, data->id_data, i_size_read(dir), res_dir); if (found == 1) return di_bh; brelse(di_bh); out: return NULL; } static int 
ocfs2_validate_dir_block(struct super_block *sb, struct buffer_head *bh) { int rc; struct ocfs2_dir_block_trailer *trailer = ocfs2_trailer_from_bh(bh, sb); /* * We don't validate dirents here, that's handled * in-place when the code walks them. */ trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); /* * If the ecc fails, we return the error but otherwise * leave the filesystem running. We know any error is * local to this block. * * Note that we are safe to call this even if the directory * doesn't have a trailer. Filesystems without metaecc will do * nothing, and filesystems with it will have one. */ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check); if (rc) mlog(ML_ERROR, "Checksum failed for dinode %llu\n", (unsigned long long)bh->b_blocknr); return rc; } /* * Validate a directory trailer. * * We check the trailer here rather than in ocfs2_validate_dir_block() * because that function doesn't have the inode to test. */ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) { int rc = 0; struct ocfs2_dir_block_trailer *trailer; trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { rc = ocfs2_error(dir->i_sb, "Invalid dirblock #%llu: signature = %.*s\n", (unsigned long long)bh->b_blocknr, 7, trailer->db_signature); goto out; } if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { rc = ocfs2_error(dir->i_sb, "Directory block #%llu has an invalid db_blkno of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } if (le64_to_cpu(trailer->db_parent_dinode) != OCFS2_I(dir)->ip_blkno) { rc = ocfs2_error(dir->i_sb, "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } out: return rc; } /* * This function forces all errors to -EIO for consistency with its * predecessor, ocfs2_bread(). We haven't audited what returning the * real error codes would do to callers. We log the real codes with * mlog_errno() before we squash them. */ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, struct buffer_head **bh, int flags) { int rc = 0; struct buffer_head *tmp = *bh; rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, ocfs2_validate_dir_block); if (rc) { mlog_errno(rc); goto out; } if (!(flags & OCFS2_BH_READAHEAD) && ocfs2_supports_dir_trailer(inode)) { rc = ocfs2_check_dir_trailer(inode, tmp); if (rc) { if (!*bh) brelse(tmp); mlog_errno(rc); goto out; } } /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ if (!*bh) *bh = tmp; out: return rc ? -EIO : 0; } /* * Read the block at 'phys' which belongs to this directory * inode. This function does no virtual->physical block translation - * what's passed in is assumed to be a valid directory block. 
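 *
 * (Within this file the physical block numbers handed in come from
 * the directory index, e.g. dx_entry->dx_dirent_blk in
 * ocfs2_dx_dir_search() below, which stores raw disk block numbers
 * rather than virtual directory offsets.)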
*/ static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys, struct buffer_head **bh) { int ret; struct buffer_head *tmp = *bh; ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp, ocfs2_validate_dir_block); if (ret) { mlog_errno(ret); goto out; } if (ocfs2_supports_dir_trailer(dir)) { ret = ocfs2_check_dir_trailer(dir, tmp); if (ret) { if (!*bh) brelse(tmp); mlog_errno(ret); goto out; } } if (!ret && !*bh) *bh = tmp; out: return ret; } static int ocfs2_validate_dx_root(struct super_block *sb, struct buffer_head *bh) { int ret; struct ocfs2_dx_root_block *dx_root; BUG_ON(!buffer_uptodate(bh)); dx_root = (struct ocfs2_dx_root_block *) bh->b_data; ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check); if (ret) { mlog(ML_ERROR, "Checksum failed for dir index root block %llu\n", (unsigned long long)bh->b_blocknr); return ret; } if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { ret = ocfs2_error(sb, "Dir Index Root # %llu has bad signature %.*s\n", (unsigned long long)le64_to_cpu(dx_root->dr_blkno), 7, dx_root->dr_signature); } return ret; } static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, struct buffer_head **dx_root_bh) { int ret; u64 blkno = le64_to_cpu(di->i_dx_root); struct buffer_head *tmp = *dx_root_bh; ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, ocfs2_validate_dx_root); /* If ocfs2_read_block() got us a new bh, pass it up. */ if (!ret && !*dx_root_bh) *dx_root_bh = tmp; return ret; } static int ocfs2_validate_dx_leaf(struct super_block *sb, struct buffer_head *bh) { int ret; struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data; BUG_ON(!buffer_uptodate(bh)); ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check); if (ret) { mlog(ML_ERROR, "Checksum failed for dir index leaf block %llu\n", (unsigned long long)bh->b_blocknr); return ret; } if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n", 7, dx_leaf->dl_signature); } return ret; } static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, struct buffer_head **dx_leaf_bh) { int ret; struct buffer_head *tmp = *dx_leaf_bh; ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, ocfs2_validate_dx_leaf); /* If ocfs2_read_block() got us a new bh, pass it up. */ if (!ret && !*dx_leaf_bh) *dx_leaf_bh = tmp; return ret; } /* * Read a series of dx_leaf blocks. This expects all buffer_head * pointers to be NULL on function entry. */ static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num, struct buffer_head **dx_leaf_bhs) { int ret; ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0, ocfs2_validate_dx_leaf); if (ret) mlog_errno(ret); return ret; } static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, struct inode *dir, struct ocfs2_dir_entry **res_dir) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; unsigned long start, block, b; int ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ int ra_ptr = 0; /* Current index into readahead buffer */ int num = 0; int nblocks, i; sb = dir->i_sb; nblocks = i_size_read(dir) >> sb->s_blocksize_bits; start = OCFS2_I(dir)->ip_dir_start_lookup; if (start >= nblocks) start = 0; block = start; restart: do { /* * We deal with the read-ahead logic here. 
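 *
 * Roughly: up to NAMEI_RA_SIZE buffers are kicked off with
 * OCFS2_BH_READAHEAD and parked in bh_use[], then each one is
 * re-read synchronously (flags == 0) as the scan reaches it.
 * ra_ptr indexes into the current batch and ra_max is its size.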
*/ if (ra_ptr >= ra_max) { /* Refill the readahead buffer */ ra_ptr = 0; b = block; for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { /* * Terminate if we reach the end of the * directory and must wrap, or if our * search has finished at this block. */ if (b >= nblocks || (num && block == start)) { bh_use[ra_max] = NULL; break; } num++; bh = NULL; ocfs2_read_dir_block(dir, b++, &bh, OCFS2_BH_READAHEAD); bh_use[ra_max] = bh; } } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; if (ocfs2_read_dir_block(dir, block, &bh, 0)) { /* read error, skip block & hope for the best. * ocfs2_read_dir_block() has released the bh. */ mlog(ML_ERROR, "reading directory %llu, " "offset %lu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, block); goto next; } i = ocfs2_search_dirblock(bh, dir, name, namelen, block << sb->s_blocksize_bits, bh->b_data, sb->s_blocksize, res_dir); if (i == 1) { OCFS2_I(dir)->ip_dir_start_lookup = block; ret = bh; goto cleanup_and_exit; } else { brelse(bh); if (i < 0) goto cleanup_and_exit; } next: if (++block >= nblocks) block = 0; } while (block != start); /* * If the directory has grown while we were searching, then * search the last part of the directory before giving up. */ block = nblocks; nblocks = i_size_read(dir) >> sb->s_blocksize_bits; if (block < nblocks) { start = 0; goto restart; } cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); trace_ocfs2_find_entry_el(ret); return ret; } static int ocfs2_dx_dir_lookup_rec(struct inode *inode, struct ocfs2_extent_list *el, u32 major_hash, u32 *ret_cpos, u64 *ret_phys_blkno, unsigned int *ret_clen) { int ret = 0, i, found; struct buffer_head *eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_rec *rec = NULL; if (el->l_tree_depth) { ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash, &eb_bh); if (ret) { mlog_errno(ret); goto out; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, "Inode %lu has non zero tree depth in btree tree block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); goto out; } } found = 0; for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { rec = &el->l_recs[i]; if (le32_to_cpu(rec->e_cpos) <= major_hash) { found = 1; break; } } if (!found) { ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in btree\n", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); goto out; } if (ret_phys_blkno) *ret_phys_blkno = le64_to_cpu(rec->e_blkno); if (ret_cpos) *ret_cpos = le32_to_cpu(rec->e_cpos); if (ret_clen) *ret_clen = le16_to_cpu(rec->e_leaf_clusters); out: brelse(eb_bh); return ret; } /* * Returns the block index, from the start of the cluster which this * hash belongs too. 
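 *
 * (Illustrative numbers only: with 4k blocks in a 32k cluster
 * there are 8 blocks per cluster, osb_dx_mask would be 0x7, and
 * the low bits of the minor hash pick one of those 8 blocks.)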
*/ static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, u32 minor_hash) { return minor_hash & osb->osb_dx_mask; } static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb, struct ocfs2_dx_hinfo *hinfo) { return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash); } static int ocfs2_dx_dir_lookup(struct inode *inode, struct ocfs2_extent_list *el, struct ocfs2_dx_hinfo *hinfo, u32 *ret_cpos, u64 *ret_phys_blkno) { int ret = 0; unsigned int cend, clen; u32 cpos; u64 blkno; u32 name_hash = hinfo->major_hash; ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno, &clen); if (ret) { mlog_errno(ret); goto out; } cend = cpos + clen; if (name_hash >= cend) { /* We want the last cluster */ blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1); cpos += clen - 1; } else { blkno += ocfs2_clusters_to_blocks(inode->i_sb, name_hash - cpos); cpos = name_hash; } /* * We now have the cluster which should hold our entry. To * find the exact block from the start of the cluster to * search, we take the lower bits of the hash. */ blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo); if (ret_phys_blkno) *ret_phys_blkno = blkno; if (ret_cpos) *ret_cpos = cpos; out: return ret; } static int ocfs2_dx_dir_search(const char *name, int namelen, struct inode *dir, struct ocfs2_dx_root_block *dx_root, struct ocfs2_dir_lookup_result *res) { int ret, i, found; u64 phys; struct buffer_head *dx_leaf_bh = NULL; struct ocfs2_dx_leaf *dx_leaf; struct ocfs2_dx_entry *dx_entry = NULL; struct buffer_head *dir_ent_bh = NULL; struct ocfs2_dir_entry *dir_ent = NULL; struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo; struct ocfs2_extent_list *dr_el; struct ocfs2_dx_entry_list *entry_list; ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo); if (ocfs2_dx_root_inline(dx_root)) { entry_list = &dx_root->dr_entries; goto search; } dr_el = &dx_root->dr_list; ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys); if (ret) { mlog_errno(ret); goto out; } trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name, hinfo->major_hash, hinfo->minor_hash, (unsigned long long)phys); ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh); if (ret) { mlog_errno(ret); goto out; } dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data; trace_ocfs2_dx_dir_search_leaf_info( le16_to_cpu(dx_leaf->dl_list.de_num_used), le16_to_cpu(dx_leaf->dl_list.de_count)); entry_list = &dx_leaf->dl_list; search: /* * Empty leaf is legal, so no need to check for that. */ found = 0; for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) { dx_entry = &entry_list->de_entries[i]; if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash) || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash)) continue; /* * Search unindexed leaf block now. We're not * guaranteed to find anything. */ ret = ocfs2_read_dir_block_direct(dir, le64_to_cpu(dx_entry->dx_dirent_blk), &dir_ent_bh); if (ret) { mlog_errno(ret); goto out; } /* * XXX: We should check the unindexed block here, * before using it. */ found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen, 0, dir_ent_bh->b_data, dir->i_sb->s_blocksize, &dir_ent); if (found == 1) break; if (found == -1) { /* This means we found a bad directory entry. 
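 * ocfs2_search_dirblock() returns -1 on a corrupt dirent (one
 * that fails ocfs2_check_dir_entry()), so this is corruption
 * rather than a miss; hence -EIO below instead of -ENOENT.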
*/ ret = -EIO; mlog_errno(ret); goto out; } brelse(dir_ent_bh); dir_ent_bh = NULL; } if (found <= 0) { ret = -ENOENT; goto out; } res->dl_leaf_bh = dir_ent_bh; res->dl_entry = dir_ent; res->dl_dx_leaf_bh = dx_leaf_bh; res->dl_dx_entry = dx_entry; ret = 0; out: if (ret) { brelse(dx_leaf_bh); brelse(dir_ent_bh); } return ret; } static int ocfs2_find_entry_dx(const char *name, int namelen, struct inode *dir, struct ocfs2_dir_lookup_result *lookup) { int ret; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; struct buffer_head *dx_root_bh = NULL; struct ocfs2_dx_root_block *dx_root; ret = ocfs2_read_inode_block(dir, &di_bh); if (ret) { mlog_errno(ret); goto out; } di = (struct ocfs2_dinode *)di_bh->b_data; ret = ocfs2_read_dx_root(dir, di, &dx_root_bh); if (ret) { mlog_errno(ret); goto out; } dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data; ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup); if (ret) { if (ret != -ENOENT) mlog_errno(ret); goto out; } lookup->dl_dx_root_bh = dx_root_bh; dx_root_bh = NULL; out: brelse(di_bh); brelse(dx_root_bh); return ret; } /* * Try to find an entry of the provided name within 'dir'. * * If nothing was found, -ENOENT is returned. Otherwise, zero is * returned and the struct 'res' will contain information useful to * other directory manipulation functions. * * Caller can NOT assume anything about the contents of the * buffer_heads - they are passed back only so that it can be passed * into any one of the manipulation functions (add entry, delete * entry, etc). As an example, bh in the extent directory case is a * data block, in the inline-data case it actually points to an inode, * in the indexed directory case, multiple buffers are involved. */ int ocfs2_find_entry(const char *name, int namelen, struct inode *dir, struct ocfs2_dir_lookup_result *lookup) { struct buffer_head *bh; struct ocfs2_dir_entry *res_dir = NULL; int ret = 0; if (ocfs2_dir_indexed(dir)) return ocfs2_find_entry_dx(name, namelen, dir, lookup); if (unlikely(i_size_read(dir) <= 0)) { ret = -EFSCORRUPTED; mlog_errno(ret); goto out; } /* * The unindexed dir code only uses part of the lookup * structure, so there's no reason to push it down further * than this. */ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { if (unlikely(i_size_read(dir) > dir->i_sb->s_blocksize)) { ret = -EFSCORRUPTED; mlog_errno(ret); goto out; } bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); } else { bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); } if (bh == NULL) return -ENOENT; lookup->dl_leaf_bh = bh; lookup->dl_entry = res_dir; out: return ret; } /* * Update inode number and type of a previously found directory entry. */ int ocfs2_update_entry(struct inode *dir, handle_t *handle, struct ocfs2_dir_lookup_result *res, struct inode *new_entry_inode) { int ret; ocfs2_journal_access_func access = ocfs2_journal_access_db; struct ocfs2_dir_entry *de = res->dl_entry; struct buffer_head *de_bh = res->dl_leaf_bh; /* * The same code works fine for both inline-data and extent * based directories, so no need to split this up. The only * difference is the journal_access function. 
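 *
 * (Inline-data directories keep their dirents in the inode block
 * itself, so the buffer must be journaled as a dinode via
 * ocfs2_journal_access_di(); extent-based directories journal the
 * leaf as a dirblock via ocfs2_journal_access_db().)
 */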
*/ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) access = ocfs2_journal_access_di; ret = access(handle, INODE_CACHE(dir), de_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno); ocfs2_set_de_type(de, new_entry_inode->i_mode); ocfs2_journal_dirty(handle, de_bh); out: return ret; } /* * __ocfs2_delete_entry deletes a directory entry by merging it with the * previous entry */ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, struct ocfs2_dir_entry *de_del, struct buffer_head *bh, char *first_de, unsigned int bytes) { struct ocfs2_dir_entry *de, *pde; int i, status = -ENOENT; ocfs2_journal_access_func access = ocfs2_journal_access_db; if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) access = ocfs2_journal_access_di; i = 0; pde = NULL; de = (struct ocfs2_dir_entry *) first_de; while (i < bytes) { if (!ocfs2_check_dir_entry(dir, de, bh, first_de, bytes, i)) { status = -EIO; mlog_errno(status); goto bail; } if (de == de_del) { status = access(handle, INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { status = -EIO; mlog_errno(status); goto bail; } if (pde) le16_add_cpu(&pde->rec_len, le16_to_cpu(de->rec_len)); de->inode = 0; inode_inc_iversion(dir); ocfs2_journal_dirty(handle, bh); goto bail; } i += le16_to_cpu(de->rec_len); pde = de; de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); } bail: return status; } static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de) { unsigned int hole; if (le64_to_cpu(de->inode) == 0) hole = le16_to_cpu(de->rec_len); else hole = le16_to_cpu(de->rec_len) - OCFS2_DIR_REC_LEN(de->name_len); return hole; } static int ocfs2_find_max_rec_len(struct super_block *sb, struct buffer_head *dirblock_bh) { int size, this_hole, largest_hole = 0; char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data; struct ocfs2_dir_entry *de; trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb); size = ocfs2_dir_trailer_blk_off(sb); limit = start + size; de_buf = start; de = (struct ocfs2_dir_entry *)de_buf; do { if (de_buf != trailer) { this_hole = ocfs2_figure_dirent_hole(de); if (this_hole > largest_hole) largest_hole = this_hole; } de_buf += le16_to_cpu(de->rec_len); de = (struct ocfs2_dir_entry *)de_buf; } while (de_buf < limit); if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) return largest_hole; return 0; } static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list, int index) { int num_used = le16_to_cpu(entry_list->de_num_used); if (num_used == 1 || index == (num_used - 1)) goto clear; memmove(&entry_list->de_entries[index], &entry_list->de_entries[index + 1], (num_used - index - 1)*sizeof(struct ocfs2_dx_entry)); clear: num_used--; memset(&entry_list->de_entries[num_used], 0, sizeof(struct ocfs2_dx_entry)); entry_list->de_num_used = cpu_to_le16(num_used); } static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, struct ocfs2_dir_lookup_result *lookup) { int ret, index, max_rec_len, add_to_free_list = 0; struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; struct buffer_head *leaf_bh = lookup->dl_leaf_bh; struct ocfs2_dx_leaf *dx_leaf; struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry; struct ocfs2_dir_block_trailer *trailer; struct ocfs2_dx_root_block *dx_root; struct ocfs2_dx_entry_list *entry_list; /* * This function gets a bit messy because we might have to * modify the root block, regardless of whether the indexed * entries are stored inline. 
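 *
 * Whichever case we're in, dr_num_entries lives in the root block,
 * and the unindexed leaf may need linking back onto the
 * dr_free_blk free list, so dx_root_bh is journaled unconditionally
 * further down.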
*/ /* * *Only* set 'entry_list' here, based on where we're looking * for the indexed entries. Later, we might still want to * journal both blocks, based on free list state. */ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; if (ocfs2_dx_root_inline(dx_root)) { entry_list = &dx_root->dr_entries; } else { dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data; entry_list = &dx_leaf->dl_list; } /* Neither of these are a disk corruption - that should have * been caught by lookup, before we got here. */ BUG_ON(le16_to_cpu(entry_list->de_count) <= 0); BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0); index = (char *)dx_entry - (char *)entry_list->de_entries; index /= sizeof(*dx_entry); if (index >= le16_to_cpu(entry_list->de_num_used)) { mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, index, entry_list, dx_entry); return -EIO; } /* * We know that removal of this dirent will leave enough room * for a new one, so add this block to the free list if it * isn't already there. */ trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb); if (trailer->db_free_rec_len == 0) add_to_free_list = 1; /* * Add the block holding our index into the journal before * removing the unindexed entry. If we get an error return * from __ocfs2_delete_entry(), then it hasn't removed the * entry yet. Likewise, successful return means we *must* * remove the indexed entry. * * We're also careful to journal the root tree block here as * the entry count needs to be updated. Also, we might be * adding to the start of the free list. */ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } if (!ocfs2_dx_root_inline(dx_root)) { ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), lookup->dl_dx_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } } trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno, index); ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, leaf_bh, leaf_bh->b_data, leaf_bh->b_size); if (ret) { mlog_errno(ret); goto out; } max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh); trailer->db_free_rec_len = cpu_to_le16(max_rec_len); if (add_to_free_list) { trailer->db_free_next = dx_root->dr_free_blk; dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr); ocfs2_journal_dirty(handle, dx_root_bh); } /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */ ocfs2_journal_dirty(handle, leaf_bh); le32_add_cpu(&dx_root->dr_num_entries, -1); ocfs2_journal_dirty(handle, dx_root_bh); ocfs2_dx_list_remove_entry(entry_list, index); if (!ocfs2_dx_root_inline(dx_root)) ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh); out: return ret; } static inline int ocfs2_delete_entry_id(handle_t *handle, struct inode *dir, struct ocfs2_dir_entry *de_del, struct buffer_head *bh) { int ret; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_inline_data *data; ret = ocfs2_read_inode_block(dir, &di_bh); if (ret) { mlog_errno(ret); goto out; } di = (struct ocfs2_dinode *)di_bh->b_data; data = &di->id2.i_data; ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data, i_size_read(dir)); brelse(di_bh); out: return ret; } static inline int ocfs2_delete_entry_el(handle_t *handle, struct inode *dir, struct ocfs2_dir_entry *de_del, struct buffer_head *bh) { return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data, bh->b_size); } /* * Delete a directory entry. 
Hide the details of directory * implementation from the caller. */ int ocfs2_delete_entry(handle_t *handle, struct inode *dir, struct ocfs2_dir_lookup_result *res) { if (ocfs2_dir_indexed(dir)) return ocfs2_delete_entry_dx(handle, dir, res); if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return ocfs2_delete_entry_id(handle, dir, res->dl_entry, res->dl_leaf_bh); return ocfs2_delete_entry_el(handle, dir, res->dl_entry, res->dl_leaf_bh); } /* * Check whether 'de' has enough room to hold an entry of * 'new_rec_len' bytes. */ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de, unsigned int new_rec_len) { unsigned int de_really_used; /* Check whether this is an empty record with enough space */ if (le64_to_cpu(de->inode) == 0 && le16_to_cpu(de->rec_len) >= new_rec_len) return 1; /* * Record might have free space at the end which we can * use. */ de_really_used = OCFS2_DIR_REC_LEN(de->name_len); if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len)) return 1; return 0; } static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf, struct ocfs2_dx_entry *dx_new_entry) { int i; i = le16_to_cpu(dx_leaf->dl_list.de_num_used); dx_leaf->dl_list.de_entries[i] = *dx_new_entry; le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1); } static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list, struct ocfs2_dx_hinfo *hinfo, u64 dirent_blk) { int i; struct ocfs2_dx_entry *dx_entry; i = le16_to_cpu(entry_list->de_num_used); dx_entry = &entry_list->de_entries[i]; memset(dx_entry, 0, sizeof(*dx_entry)); dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash); dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash); dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk); le16_add_cpu(&entry_list->de_num_used, 1); } static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, struct ocfs2_dx_hinfo *hinfo, u64 dirent_blk, struct buffer_head *dx_leaf_bh) { int ret; struct ocfs2_dx_leaf *dx_leaf; ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk); ocfs2_journal_dirty(handle, dx_leaf_bh); out: return ret; } static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle, struct ocfs2_dx_hinfo *hinfo, u64 dirent_blk, struct ocfs2_dx_root_block *dx_root) { ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk); } static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle, struct ocfs2_dir_lookup_result *lookup) { int ret = 0; struct ocfs2_dx_root_block *dx_root; struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data; if (ocfs2_dx_root_inline(dx_root)) { ocfs2_dx_inline_root_insert(dir, handle, &lookup->dl_hinfo, lookup->dl_leaf_bh->b_blocknr, dx_root); } else { ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo, lookup->dl_leaf_bh->b_blocknr, lookup->dl_dx_leaf_bh); if (ret) goto out; } le32_add_cpu(&dx_root->dr_num_entries, 1); ocfs2_journal_dirty(handle, dx_root_bh); out: return ret; } static void ocfs2_remove_block_from_free_list(struct inode *dir, handle_t *handle, struct ocfs2_dir_lookup_result *lookup) { struct ocfs2_dir_block_trailer *trailer, *prev; struct ocfs2_dx_root_block *dx_root; 
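/*
 * 'bh' (declared below) ends up pointing at whichever block holds
 * the link to us: the dx root when this leaf is at the head of the
 * free list, otherwise the trailer of the previous leaf in the
 * chain.
 */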
struct buffer_head *bh; trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); if (ocfs2_free_list_at_root(lookup)) { bh = lookup->dl_dx_root_bh; dx_root = (struct ocfs2_dx_root_block *)bh->b_data; dx_root->dr_free_blk = trailer->db_free_next; } else { bh = lookup->dl_prev_leaf_bh; prev = ocfs2_trailer_from_bh(bh, dir->i_sb); prev->db_free_next = trailer->db_free_next; } trailer->db_free_rec_len = cpu_to_le16(0); trailer->db_free_next = cpu_to_le64(0); ocfs2_journal_dirty(handle, bh); ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); } /* * This expects that a journal write has been reserved on * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh */ static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle, struct ocfs2_dir_lookup_result *lookup) { int max_rec_len; struct ocfs2_dir_block_trailer *trailer; /* Walk dl_leaf_bh to figure out what the new free rec_len is. */ max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh); if (max_rec_len) { /* * There's still room in this block, so no need to remove it * from the free list. In this case, we just want to update * the rec len accounting. */ trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb); trailer->db_free_rec_len = cpu_to_le16(max_rec_len); ocfs2_journal_dirty(handle, lookup->dl_leaf_bh); } else { ocfs2_remove_block_from_free_list(dir, handle, lookup); } } /* we don't always have a dentry for what we want to add, so people * like orphan dir can call this instead. * * The lookup context must have been filled from * ocfs2_prepare_dir_for_insert. */ int __ocfs2_add_entry(handle_t *handle, struct inode *dir, const char *name, int namelen, struct inode *inode, u64 blkno, struct buffer_head *parent_fe_bh, struct ocfs2_dir_lookup_result *lookup) { unsigned long offset; unsigned short rec_len; struct ocfs2_dir_entry *de, *de1; struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; struct super_block *sb = dir->i_sb; int retval; unsigned int size = sb->s_blocksize; struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; if (ocfs2_dir_indexed(dir)) { struct buffer_head *bh; /* * An indexed dir may require that we update the free space * list. Reserve a write to the previous node in the list so * that we don't fail later. * * XXX: This can be either a dx_root_block, or an unindexed * directory tree leaf block. */ if (ocfs2_free_list_at_root(lookup)) { bh = lookup->dl_dx_root_bh; retval = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_WRITE); } else { bh = lookup->dl_prev_leaf_bh; retval = ocfs2_journal_access_db(handle, INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_WRITE); } if (retval) { mlog_errno(retval); return retval; } } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { data_start = di->id2.i_data.id_data; size = i_size_read(dir); BUG_ON(insert_bh != parent_fe_bh); } rec_len = OCFS2_DIR_REC_LEN(namelen); offset = 0; de = (struct ocfs2_dir_entry *) data_start; while (1) { BUG_ON((char *)de >= (size + data_start)); /* These checks should've already been passed by the * prepare function, but I guess we can leave them * here anyway. */ if (!ocfs2_check_dir_entry(dir, de, insert_bh, data_start, size, offset)) { retval = -ENOENT; goto bail; } if (ocfs2_match(namelen, name, de)) { retval = -EEXIST; goto bail; } /* We're guaranteed that we should have space, so we * can't possibly have hit the trailer...right? 
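 *
 * (ocfs2_prepare_dir_for_insert() is supposed to have found or made
 * a block with room for the new dirent, so walking into the trailer
 * here would mean the lookup result and the block contents
 * disagree. That's treated as a bug, not a runtime error, which is
 * why the check below is mlog_bug_on_msg() rather than a bail-out.)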
*/ mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size), "Hit dir trailer trying to insert %.*s " "(namelen %d) into directory %llu. " "offset is %lu, trailer offset is %d\n", namelen, name, namelen, (unsigned long long)parent_fe_bh->b_blocknr, offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); if (ocfs2_dirent_would_fit(de, rec_len)) { inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (retval < 0) { mlog_errno(retval); goto bail; } if (insert_bh == parent_fe_bh) retval = ocfs2_journal_access_di(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); else { retval = ocfs2_journal_access_db(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (!retval && ocfs2_dir_indexed(dir)) retval = ocfs2_dx_dir_insert(dir, handle, lookup); } if (retval) { mlog_errno(retval); goto bail; } /* By now the buffer is marked for journaling */ offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { de1 = (struct ocfs2_dir_entry *)((char *) de + OCFS2_DIR_REC_LEN(de->name_len)); de1->rec_len = cpu_to_le16(le16_to_cpu(de->rec_len) - OCFS2_DIR_REC_LEN(de->name_len)); de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); de = de1; } de->file_type = FT_UNKNOWN; if (blkno) { de->inode = cpu_to_le64(blkno); ocfs2_set_de_type(de, inode->i_mode); } else de->inode = 0; de->name_len = namelen; memcpy(de->name, name, namelen); if (ocfs2_dir_indexed(dir)) ocfs2_recalc_free_list(dir, handle, lookup); inode_inc_iversion(dir); ocfs2_journal_dirty(handle, insert_bh); retval = 0; goto bail; } offset += le16_to_cpu(de->rec_len); de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); } /* when you think about it, the assert above should prevent us * from ever getting here. */ retval = -ENOSPC; bail: if (retval) mlog_errno(retval); return retval; } static int ocfs2_dir_foreach_blk_id(struct inode *inode, u64 *f_version, struct dir_context *ctx) { int ret, i; unsigned long offset = ctx->pos; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_inline_data *data; struct ocfs2_dir_entry *de; ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); goto out; } di = (struct ocfs2_dinode *)di_bh->b_data; data = &di->id2.i_data; while (ctx->pos < i_size_read(inode)) { /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ if (!inode_eq_iversion(inode, *f_version)) { for (i = 0; i < i_size_read(inode) && i < offset; ) { de = (struct ocfs2_dir_entry *) (data->id_data + i); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (le16_to_cpu(de->rec_len) < OCFS2_DIR_REC_LEN(1)) break; i += le16_to_cpu(de->rec_len); } ctx->pos = offset = i; *f_version = inode_query_iversion(inode); } de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos); if (!ocfs2_check_dir_entry(inode, de, di_bh, (char *)data->id_data, i_size_read(inode), ctx->pos)) { /* On error, skip the f_pos to the end. 
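 * Setting it to i_size makes the loop condition above false, so a
 * corrupt inline dirent ends the readdir cleanly instead of
 * looping on bad data.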
*/ ctx->pos = i_size_read(inode); break; } offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode), fs_ftype_to_dtype(de->file_type))) goto out; } ctx->pos += le16_to_cpu(de->rec_len); } out: brelse(di_bh); return 0; } /* * NOTE: This function can be called against unindexed directories, * and indexed ones. */ static int ocfs2_dir_foreach_blk_el(struct inode *inode, u64 *f_version, struct dir_context *ctx, bool persist) { unsigned long offset, blk, last_ra_blk = 0; int i; struct buffer_head * bh, * tmp; struct ocfs2_dir_entry * de; struct super_block * sb = inode->i_sb; unsigned int ra_sectors = 16; int stored = 0; bh = NULL; offset = ctx->pos & (sb->s_blocksize - 1); while (ctx->pos < i_size_read(inode)) { blk = ctx->pos >> sb->s_blocksize_bits; if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { /* Skip the corrupt dirblock and keep trying */ ctx->pos += sb->s_blocksize - offset; continue; } /* The idea here is to begin with 8k read-ahead and to stay * 4k ahead of our current position. * * TODO: Use the pagecache for this. We just need to * make sure it's cluster-safe... */ if (!last_ra_blk || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { for (i = ra_sectors >> (sb->s_blocksize_bits - 9); i > 0; i--) { tmp = NULL; if (!ocfs2_read_dir_block(inode, ++blk, &tmp, OCFS2_BH_READAHEAD)) brelse(tmp); } last_ra_blk = blk; ra_sectors = 8; } /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ if (!inode_eq_iversion(inode, *f_version)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ocfs2_dir_entry *) (bh->b_data + i); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (le16_to_cpu(de->rec_len) < OCFS2_DIR_REC_LEN(1)) break; i += le16_to_cpu(de->rec_len); } offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; *f_version = inode_query_iversion(inode); } while (ctx->pos < i_size_read(inode) && offset < sb->s_blocksize) { de = (struct ocfs2_dir_entry *) (bh->b_data + offset); if (!ocfs2_check_dir_entry(inode, de, bh, bh->b_data, sb->s_blocksize, offset)) { /* On error, skip the f_pos to the next block. */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; break; } if (le64_to_cpu(de->inode)) { if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode), fs_ftype_to_dtype(de->file_type))) { brelse(bh); return 0; } stored++; } offset += le16_to_cpu(de->rec_len); ctx->pos += le16_to_cpu(de->rec_len); } offset = 0; brelse(bh); bh = NULL; if (!persist && stored) break; } return 0; } static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, struct dir_context *ctx, bool persist) { if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return ocfs2_dir_foreach_blk_id(inode, f_version, ctx); return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist); } /* * This is intended to be called from inside other kernel functions, * so we fake some arguments. 
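 *
 * A typical in-kernel caller embeds a struct dir_context with a
 * custom actor and some private state -- ocfs2_empty_dir() below
 * uses exactly this pattern. A minimal (hypothetical) sketch:
 *
 *	struct my_priv {
 *		struct dir_context ctx;
 *		int count;
 *	};
 *
 *	static bool my_actor(struct dir_context *ctx, const char *name,
 *			     int name_len, loff_t pos, u64 ino,
 *			     unsigned type)
 *	{
 *		struct my_priv *p = container_of(ctx, struct my_priv, ctx);
 *
 *		p->count++;
 *		return true;	<-- keep iterating
 *	}
 *
 *	struct my_priv priv = { .ctx.actor = my_actor, };
 *	ocfs2_dir_foreach(inode, &priv.ctx);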
*/
int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
{
	u64 version = inode_query_iversion(inode);
	ocfs2_dir_foreach_blk(inode, &version, ctx, true);
	return 0;
}

/*
 * ocfs2_readdir() - iterate a directory on behalf of readdir(2),
 * taking and dropping the cluster lock at the appropriate level.
 */
int ocfs2_readdir(struct file *file, struct dir_context *ctx)
{
	int error = 0;
	struct inode *inode = file_inode(file);
	struct ocfs2_file_private *fp = file->private_data;
	int lock_level = 0;

	trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);

	error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1);
	if (lock_level && error >= 0) {
		/* We release EX lock which used to update atime
		 * and get PR lock again to reduce contention
		 * on commonly accessed directories. */
		ocfs2_inode_unlock(inode, 1);
		lock_level = 0;
		error = ocfs2_inode_lock(inode, NULL, 0);
	}
	if (error < 0) {
		if (error != -ENOENT)
			mlog_errno(error);
		/* we haven't got any yet, so propagate the error. */
		goto bail_nolock;
	}

	error = ocfs2_dir_foreach_blk(inode, &fp->cookie, ctx, false);

	ocfs2_inode_unlock(inode, lock_level);
	if (error)
		mlog_errno(error);

bail_nolock:
	return error;
}

/*
 * NOTE: this should always be called with parent dir i_rwsem taken.
 */
int ocfs2_find_files_on_disk(const char *name,
			     int namelen,
			     u64 *blkno,
			     struct inode *inode,
			     struct ocfs2_dir_lookup_result *lookup)
{
	int status = -ENOENT;

	trace_ocfs2_find_files_on_disk(namelen, name, blkno,
				(unsigned long long)OCFS2_I(inode)->ip_blkno);

	status = ocfs2_find_entry(name, namelen, inode, lookup);
	if (status)
		goto leave;

	*blkno = le64_to_cpu(lookup->dl_entry->inode);

	status = 0;
leave:
	return status;
}

/*
 * Convenience function for callers which just want the block number
 * mapped to a name and don't require the full dirent info, etc.
 */
int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
			       int namelen, u64 *blkno)
{
	int ret;
	struct ocfs2_dir_lookup_result lookup = { NULL, };

	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
	ocfs2_free_dir_lookup_result(&lookup);

	return ret;
}

/* Check for a name within a directory.
 *
 * Return 0 if the name does not exist
 * Return -EEXIST if the directory contains the name
 * Return -EFSCORRUPTED if found corruption
 *
 * Callers should have i_rwsem + a cluster lock on dir
 */
int ocfs2_check_dir_for_entry(struct inode *dir,
			      const char *name,
			      int namelen)
{
	int ret = 0;
	struct ocfs2_dir_lookup_result lookup = { NULL, };

	trace_ocfs2_check_dir_for_entry(
		(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);

	ret = ocfs2_find_entry(name, namelen, dir, &lookup);
	if (ret == 0) {
		ret = -EEXIST;
		mlog_errno(ret);
	} else if (ret == -ENOENT) {
		ret = 0;
	}

	ocfs2_free_dir_lookup_result(&lookup);

	return ret;
}

struct ocfs2_empty_dir_priv {
	struct dir_context ctx;
	unsigned seen_dot;
	unsigned seen_dot_dot;
	unsigned seen_other;
	unsigned dx_dir;
};

static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
				    int name_len, loff_t pos, u64 ino,
				    unsigned type)
{
	struct ocfs2_empty_dir_priv *p =
		container_of(ctx, struct ocfs2_empty_dir_priv, ctx);

	/*
	 * Check the positions of "." and ".." records to be sure
	 * they're in the correct place.
	 *
	 * Indexed directories don't need to proceed past the first
	 * two entries, so we end the scan after seeing '..'. Despite
	 * that, we allow the scan to proceed in the event that we
	 * have a corrupted indexed directory (no dot or dot dot
	 * entries). This allows us to double check for existing
	 * entries which might not have been found in the index.
*/ if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { p->seen_dot = 1; return true; } if (name_len == 2 && !strncmp("..", name, 2) && pos == OCFS2_DIR_REC_LEN(1)) { p->seen_dot_dot = 1; if (p->dx_dir && p->seen_dot) return false; return true; } p->seen_other = 1; return false; } static int ocfs2_empty_dir_dx(struct inode *inode, struct ocfs2_empty_dir_priv *priv) { int ret; struct buffer_head *di_bh = NULL; struct buffer_head *dx_root_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_dx_root_block *dx_root; priv->dx_dir = 1; ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog_errno(ret); goto out; } di = (struct ocfs2_dinode *)di_bh->b_data; ret = ocfs2_read_dx_root(inode, di, &dx_root_bh); if (ret) { mlog_errno(ret); goto out; } dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; if (le32_to_cpu(dx_root->dr_num_entries) != 2) priv->seen_other = 1; out: brelse(di_bh); brelse(dx_root_bh); return ret; } /* * routine to check that the specified directory is empty (for rmdir) * * Returns 1 if dir is empty, zero otherwise. * * XXX: This is a performance problem for unindexed directories. */ int ocfs2_empty_dir(struct inode *inode) { int ret; struct ocfs2_empty_dir_priv priv = { .ctx.actor = ocfs2_empty_dir_filldir, }; if (ocfs2_dir_indexed(inode)) { ret = ocfs2_empty_dir_dx(inode, &priv); if (ret) mlog_errno(ret); /* * We still run ocfs2_dir_foreach to get the checks * for "." and "..". */ } ret = ocfs2_dir_foreach(inode, &priv.ctx); if (ret) mlog_errno(ret); if (!priv.seen_dot || !priv.seen_dot_dot) { mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); /* * XXX: Is it really safe to allow an unlink to continue? */ return 1; } return !priv.seen_other; } /* * Fills "." and ".." dirents in a new directory block. Returns dirent for * "..", which might be used during creation of a directory with a trailing * header. It is otherwise safe to ignore the return code. */ static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode, struct inode *parent, char *start, unsigned int size) { struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); de->name_len = 1; de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); strcpy(de->name, "."); ocfs2_set_de_type(de, S_IFDIR); de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1)); de->name_len = 2; strcpy(de->name, ".."); ocfs2_set_de_type(de, S_IFDIR); return de; } /* * This works together with code in ocfs2_mknod_locked() which sets * the inline-data flag and initializes the inline-data section. 
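 *
 * The resulting inline area is laid out by ocfs2_fill_initial_dirents()
 * above, roughly:
 *
 *	offset 0:                    "."  rec_len = OCFS2_DIR_REC_LEN(1)
 *	offset OCFS2_DIR_REC_LEN(1): ".." rec_len = size - OCFS2_DIR_REC_LEN(1)
 *
 * i.e. ".." absorbs all of the remaining free space; later inserts
 * carve their records out of its rec_len.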
*/ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, handle_t *handle, struct inode *parent, struct inode *inode, struct buffer_head *di_bh) { int ret; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_inline_data *data = &di->id2.i_data; unsigned int size = le16_to_cpu(data->id_count); ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); ocfs2_journal_dirty(handle, di_bh); i_size_write(inode, size); set_nlink(inode, 2); inode->i_blocks = ocfs2_inode_sector_count(inode); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); if (ret < 0) mlog_errno(ret); out: return ret; } static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, handle_t *handle, struct inode *parent, struct inode *inode, struct buffer_head *fe_bh, struct ocfs2_alloc_context *data_ac, struct buffer_head **ret_new_bh) { int status; unsigned int size = osb->sb->s_blocksize; struct buffer_head *new_bh = NULL; struct ocfs2_dir_entry *de; if (ocfs2_new_dir_wants_trailer(inode)) size = ocfs2_dir_trailer_blk_off(parent->i_sb); status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, data_ac, NULL, &new_bh); if (status < 0) { mlog_errno(status); goto bail; } ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; } memset(new_bh->b_data, 0, osb->sb->s_blocksize); de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); if (ocfs2_new_dir_wants_trailer(inode)) { int size = le16_to_cpu(de->rec_len); /* * Figure out the size of the hole left over after * insertion of '.' and '..'. The trailer wants this * information. 
*/ size -= OCFS2_DIR_REC_LEN(2); size -= sizeof(struct ocfs2_dir_block_trailer); ocfs2_init_dir_trailer(inode, new_bh, size); } ocfs2_journal_dirty(handle, new_bh); i_size_write(inode, inode->i_sb->s_blocksize); set_nlink(inode, 2); inode->i_blocks = ocfs2_inode_sector_count(inode); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { mlog_errno(status); goto bail; } status = 0; if (ret_new_bh) { *ret_new_bh = new_bh; new_bh = NULL; } bail: brelse(new_bh); return status; } static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, handle_t *handle, struct inode *dir, struct buffer_head *di_bh, struct buffer_head *dirdata_bh, struct ocfs2_alloc_context *meta_ac, int dx_inline, u32 num_entries, struct buffer_head **ret_dx_root_bh) { int ret; struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; u16 dr_suballoc_bit; u64 suballoc_loc, dr_blkno; unsigned int num_bits; struct buffer_head *dx_root_bh = NULL; struct ocfs2_dx_root_block *dx_root; struct ocfs2_dir_block_trailer *trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &dr_suballoc_bit, &num_bits, &dr_blkno); if (ret) { mlog_errno(ret); goto out; } trace_ocfs2_dx_dir_attach_index( (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)dr_blkno); dx_root_bh = sb_getblk(osb->sb, dr_blkno); if (dx_root_bh == NULL) { ret = -ENOMEM; goto out; } ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh); ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); goto out; } dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; memset(dx_root, 0, osb->sb->s_blocksize); strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc); dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); dx_root->dr_blkno = cpu_to_le64(dr_blkno); dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno); dx_root->dr_num_entries = cpu_to_le32(num_entries); if (le16_to_cpu(trailer->db_free_rec_len)) dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr); else dx_root->dr_free_blk = cpu_to_le64(0); if (dx_inline) { dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE; dx_root->dr_entries.de_count = cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb)); } else { dx_root->dr_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); } ocfs2_journal_dirty(handle, dx_root_bh); ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); goto out; } di->i_dx_root = cpu_to_le64(dr_blkno); spin_lock(&OCFS2_I(dir)->ip_lock); OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); spin_unlock(&OCFS2_I(dir)->ip_lock); ocfs2_journal_dirty(handle, di_bh); *ret_dx_root_bh = dx_root_bh; dx_root_bh = NULL; out: brelse(dx_root_bh); return ret; } static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, handle_t *handle, struct inode *dir, struct buffer_head **dx_leaves, int num_dx_leaves, u64 start_blk) { int ret, i; struct ocfs2_dx_leaf *dx_leaf; struct buffer_head *bh; for (i = 0; i < num_dx_leaves; i++) { bh = sb_getblk(osb->sb, start_blk + i); if (bh == NULL) { ret = -ENOMEM; goto out; } dx_leaves[i] = bh; ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh); ret = ocfs2_journal_access_dl(handle, 
INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); goto out; } dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data; memset(dx_leaf, 0, osb->sb->s_blocksize); strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE); dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation); dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr); dx_leaf->dl_list.de_count = cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb)); trace_ocfs2_dx_dir_format_cluster( (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)bh->b_blocknr, le16_to_cpu(dx_leaf->dl_list.de_count)); ocfs2_journal_dirty(handle, bh); } ret = 0; out: return ret; } /* * Allocates and formats a new cluster for use in an indexed dir * leaf. This version will not do the extent insert, so that it can be * used by operations which need careful ordering. */ static int __ocfs2_dx_dir_new_cluster(struct inode *dir, u32 cpos, handle_t *handle, struct ocfs2_alloc_context *data_ac, struct buffer_head **dx_leaves, int num_dx_leaves, u64 *ret_phys_blkno) { int ret; u32 phys, num; u64 phys_blkno; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); /* * XXX: For create, this should claim cluster for the index * *before* the unindexed insert so that we have a better * chance of contiguousness as the directory grows in number * of entries. */ ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num); if (ret) { mlog_errno(ret); goto out; } /* * Format the new cluster first. That way, we're inserting * valid data. */ phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys); ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves, num_dx_leaves, phys_blkno); if (ret) { mlog_errno(ret); goto out; } *ret_phys_blkno = phys_blkno; out: return ret; } static int ocfs2_dx_dir_new_cluster(struct inode *dir, struct ocfs2_extent_tree *et, u32 cpos, handle_t *handle, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct buffer_head **dx_leaves, int num_dx_leaves) { int ret; u64 phys_blkno; ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, num_dx_leaves, &phys_blkno); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0, meta_ac); if (ret) mlog_errno(ret); out: return ret; } static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb, int *ret_num_leaves) { int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1); struct buffer_head **dx_leaves; dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *), GFP_NOFS); if (dx_leaves && ret_num_leaves) *ret_num_leaves = num_dx_leaves; return dx_leaves; } static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb, handle_t *handle, struct inode *parent, struct inode *inode, struct buffer_head *di_bh, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac) { int ret; struct buffer_head *leaf_bh = NULL; struct buffer_head *dx_root_bh = NULL; struct ocfs2_dx_hinfo hinfo; struct ocfs2_dx_root_block *dx_root; struct ocfs2_dx_entry_list *entry_list; /* * Our strategy is to create the directory as though it were * unindexed, then add the index block. This works with very * little complication since the state of a new directory is a * very well known quantity. * * Essentially, we have two dirents ("." and ".."), in the 1st * block which need indexing. These are easily inserted into * the index block. 
*/ ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh, data_ac, &leaf_bh); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh, meta_ac, 1, 2, &dx_root_bh); if (ret) { mlog_errno(ret); goto out; } dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; entry_list = &dx_root->dr_entries; /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */ ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo); ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo); ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr); out: brelse(dx_root_bh); brelse(leaf_bh); return ret; } int ocfs2_fill_new_dir(struct ocfs2_super *osb, handle_t *handle, struct inode *parent, struct inode *inode, struct buffer_head *fe_bh, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac) { BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL); if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh); if (ocfs2_supports_indexed_dirs(osb)) return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh, data_ac, meta_ac); return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh, data_ac, NULL); } static int ocfs2_dx_dir_index_block(struct inode *dir, handle_t *handle, struct buffer_head **dx_leaves, int num_dx_leaves, u32 *num_dx_entries, struct buffer_head *dirent_bh) { int ret = 0, namelen, i; char *de_buf, *limit; struct ocfs2_dir_entry *de; struct buffer_head *dx_leaf_bh; struct ocfs2_dx_hinfo hinfo; u64 dirent_blk = dirent_bh->b_blocknr; de_buf = dirent_bh->b_data; limit = de_buf + dir->i_sb->s_blocksize; while (de_buf < limit) { de = (struct ocfs2_dir_entry *)de_buf; namelen = de->name_len; if (!namelen || !de->inode) goto inc; ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo); i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo); dx_leaf_bh = dx_leaves[i]; ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo, dirent_blk, dx_leaf_bh); if (ret) { mlog_errno(ret); goto out; } *num_dx_entries = *num_dx_entries + 1; inc: de_buf += le16_to_cpu(de->rec_len); } out: return ret; } /* * XXX: This expects dx_root_bh to already be part of the transaction. */ static void ocfs2_dx_dir_index_root_block(struct inode *dir, struct buffer_head *dx_root_bh, struct buffer_head *dirent_bh) { char *de_buf, *limit; struct ocfs2_dx_root_block *dx_root; struct ocfs2_dir_entry *de; struct ocfs2_dx_hinfo hinfo; u64 dirent_blk = dirent_bh->b_blocknr; dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; de_buf = dirent_bh->b_data; limit = de_buf + dir->i_sb->s_blocksize; while (de_buf < limit) { de = (struct ocfs2_dir_entry *)de_buf; if (!de->name_len || !de->inode) goto inc; ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); trace_ocfs2_dx_dir_index_root_block( (unsigned long long)dir->i_ino, hinfo.major_hash, hinfo.minor_hash, de->name_len, de->name, le16_to_cpu(dx_root->dr_entries.de_num_used)); ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, dirent_blk); le32_add_cpu(&dx_root->dr_num_entries, 1); inc: de_buf += le16_to_cpu(de->rec_len); } } /* * Count the number of inline directory entries in di_bh and compare * them against the number of entries we can hold in an inline dx root * block. 
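 *
 * A struct ocfs2_dx_entry is a small fixed-size record (major and
 * minor hash plus the block number of the dirent's leaf), so the
 * root's capacity depends only on block size. How many live dirents
 * the inline data area holds depends on name lengths, though, so we
 * have to count rather than decide statically.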
*/ static int ocfs2_new_dx_should_be_inline(struct inode *dir, struct buffer_head *di_bh) { int dirent_count = 0; char *de_buf, *limit; struct ocfs2_dir_entry *de; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; de_buf = di->id2.i_data.id_data; limit = de_buf + i_size_read(dir); while (de_buf < limit) { de = (struct ocfs2_dir_entry *)de_buf; if (de->name_len && de->inode) dirent_count++; de_buf += le16_to_cpu(de->rec_len); } /* We are careful to leave room for one extra record. */ return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb); } /* * Expand rec_len of the rightmost dirent in a directory block so that it * contains the end of our valid space for dirents. We do this during * expansion from an inline directory to one with extents. The first dir block * in that case is taken from the inline data portion of the inode block. * * This will also return the largest amount of contiguous space for a dirent * in the block. That value is *not* necessarily the last dirent, even after * expansion. The directory indexing code wants this value for free space * accounting. We do this here since we're already walking the entire dir * block. * * We add the dir trailer if this filesystem wants it. */ static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size, struct inode *dir) { struct super_block *sb = dir->i_sb; struct ocfs2_dir_entry *de; struct ocfs2_dir_entry *prev_de; char *de_buf, *limit; unsigned int new_size = sb->s_blocksize; unsigned int bytes, this_hole; unsigned int largest_hole = 0; if (ocfs2_new_dir_wants_trailer(dir)) new_size = ocfs2_dir_trailer_blk_off(sb); bytes = new_size - old_size; limit = start + old_size; de_buf = start; de = (struct ocfs2_dir_entry *)de_buf; do { this_hole = ocfs2_figure_dirent_hole(de); if (this_hole > largest_hole) largest_hole = this_hole; prev_de = de; de_buf += le16_to_cpu(de->rec_len); de = (struct ocfs2_dir_entry *)de_buf; } while (de_buf < limit); le16_add_cpu(&prev_de->rec_len, bytes); /* We need to double check this after modification of the final * dirent. */ this_hole = ocfs2_figure_dirent_hole(prev_de); if (this_hole > largest_hole) largest_hole = this_hole; if (largest_hole >= OCFS2_DIR_MIN_REC_LEN) return largest_hole; return 0; } /* * We allocate enough clusters to fulfill "blocks_wanted", but set * i_size to exactly one block. Ocfs2_extend_dir() will handle the * rest automatically for us. * * *first_block_bh is a pointer to the 1st data block allocated to the * directory. 
*/ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, unsigned int blocks_wanted, struct ocfs2_dir_lookup_result *lookup, struct buffer_head **first_block_bh) { u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0; struct super_block *sb = dir->i_sb; int ret, i, num_dx_leaves = 0, dx_inline = 0, credits = ocfs2_inline_to_extents_credits(sb); u64 dx_insert_blkno, blkno, bytes = blocks_wanted << sb->s_blocksize_bits; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(dir); struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; struct buffer_head *dirdata_bh = NULL; struct buffer_head *dx_root_bh = NULL; struct buffer_head **dx_leaves = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle; struct ocfs2_extent_tree et; struct ocfs2_extent_tree dx_et; int did_quota = 0, bytes_allocated = 0; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh); alloc = ocfs2_clusters_for_bytes(sb, bytes); dx_alloc = 0; down_write(&oi->ip_alloc_sem); if (ocfs2_supports_indexed_dirs(osb)) { credits += ocfs2_add_dir_index_credits(sb); dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh); if (!dx_inline) { /* Add one more cluster for an index leaf */ dx_alloc++; dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb, &num_dx_leaves); if (!dx_leaves) { ret = -ENOMEM; mlog_errno(ret); goto out; } } /* This gets us the dx_root */ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); if (ret) { mlog_errno(ret); goto out; } } /* * We should never need more than 2 clusters for the unindexed * tree - maximum dirent size is far less than one block. In * fact, the only time we'd need more than one cluster is if * blocksize == clustersize and the dirent won't fit in the * extra space that the expansion to a single block gives. As * of today, that only happens on 4k/4k file systems. */ BUG_ON(alloc > 2); ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac); if (ret) { mlog_errno(ret); goto out; } /* * Prepare for worst case allocation scenario of two separate * extents in the unindexed tree. */ if (alloc == 2) credits += OCFS2_SUBALLOC_ALLOC; handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out; } ret = dquot_alloc_space_nodirty(dir, ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc)); if (ret) goto out_commit; did_quota = 1; if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { /* * Allocate our index cluster first, to maximize the * possibility that unindexed leaves grow * contiguously. */ ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves, num_dx_leaves, &dx_insert_blkno); if (ret) { mlog_errno(ret); goto out_commit; } bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); } /* * Try to claim as many clusters as the bitmap can give though * if we only get one now, that's enough to continue. The rest * will be claimed after the conversion to extents. */ if (ocfs2_dir_resv_allowed(osb)) data_ac->ac_resv = &oi->ip_la_data_resv; ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len); if (ret) { mlog_errno(ret); goto out_commit; } bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); /* * Operations are carefully ordered so that we set up the new * data block first. The conversion from inline data to * extents follows. 
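	 *
	 * Roughly, the order is:
	 *
	 *	1. Copy the inline dirents into the newly claimed block,
	 *	   expand the last dirent and (optionally) add a trailer.
	 *	2. For a non-inline index, hash the dirents into the
	 *	   already formatted dx leaves.
	 *	3. Clear OCFS2_INLINE_DATA_FL, reset i_size and the
	 *	   dinode extent list, then insert the new data extent.
	 *	4. Attach the index root (and its leaf extent, if any).
	 *
	 * All of it happens inside the single transaction started
	 * above.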
*/
	blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
	dirdata_bh = sb_getblk(sb, blkno);
	if (!dirdata_bh) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out_commit;
	}

	ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);

	ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
	memset(dirdata_bh->b_data + i_size_read(dir), 0,
	       sb->s_blocksize - i_size_read(dir));
	i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
	if (ocfs2_new_dir_wants_trailer(dir)) {
		/*
		 * Prepare the dir trailer up front. It will otherwise look
		 * like a valid dirent. Even if inserting the index fails
		 * (unlikely), all we'll have done is give the first dir
		 * block a small amount of fragmentation.
		 */
		ocfs2_init_dir_trailer(dir, dirdata_bh, i);
	}

	ocfs2_update_inode_fsync_trans(handle, dir, 1);
	ocfs2_journal_dirty(handle, dirdata_bh);

	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
		/*
		 * Dx dirs with an external cluster need to do this up
		 * front. Inline dx roots get handled later, after
		 * we've allocated our root block. We get passed back
		 * a total number of items so that dr_num_entries can
		 * be correctly set once the dx_root has been
		 * allocated.
		 */
		ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
					       num_dx_leaves, &num_dx_entries,
					       dirdata_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}
	}

	/*
	 * Set extent, i_size, etc. on the directory. After this, the
	 * inode should contain the same exact dirents as before and
	 * be fully accessible from system calls.
	 *
	 * We let the later dirent insert modify c/mtime - to the user
	 * the data hasn't changed.
	 */
	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_dinode_new_extent_list(dir, di);

	i_size_write(dir, sb->s_blocksize);
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));

	di->i_size = cpu_to_le64(sb->s_blocksize);
	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir));
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir));
	ocfs2_update_inode_fsync_trans(handle, dir, 1);

	/*
	 * This should never fail as our extent list is empty and all
	 * related blocks have been journaled already.
	 */
	ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
				  0, NULL);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Set i_blocks after the extent insert for the most up to
	 * date ip_clusters value.
	 */
	dir->i_blocks = ocfs2_inode_sector_count(dir);

	ocfs2_journal_dirty(handle, di_bh);

	if (ocfs2_supports_indexed_dirs(osb)) {
		ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
						dirdata_bh, meta_ac, dx_inline,
						num_dx_entries, &dx_root_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		if (dx_inline) {
			ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
						      dirdata_bh);
		} else {
			ocfs2_init_dx_root_extent_tree(&dx_et,
						       INODE_CACHE(dir),
						       dx_root_bh);

			ret = ocfs2_insert_extent(handle, &dx_et, 0,
						  dx_insert_blkno, 1, 0, NULL);
			if (ret)
				mlog_errno(ret);
		}
	}

	/*
	 * We asked for two clusters, but only got one in the 1st
	 * pass. Claim the 2nd cluster as a separate extent.
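	 *
	 * As the comment near the top of this function explains, two
	 * clusters are only ever needed on blocksize == clustersize
	 * setups where the dirent won't fit in the first block; if the
	 * claim above handed back fewer clusters than 'alloc' asked
	 * for, pick up the remainder here.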
*/ if (alloc > len) { ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len); if (ret) { mlog_errno(ret); goto out_commit; } blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); ret = ocfs2_insert_extent(handle, &et, 1, blkno, len, 0, NULL); if (ret) { mlog_errno(ret); goto out_commit; } bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1); } *first_block_bh = dirdata_bh; dirdata_bh = NULL; if (ocfs2_supports_indexed_dirs(osb)) { unsigned int off; if (!dx_inline) { /* * We need to return the correct block within the * cluster which should hold our entry. */ off = ocfs2_dx_dir_hash_idx(osb, &lookup->dl_hinfo); get_bh(dx_leaves[off]); lookup->dl_dx_leaf_bh = dx_leaves[off]; } lookup->dl_dx_root_bh = dx_root_bh; dx_root_bh = NULL; } out_commit: if (ret < 0 && did_quota) dquot_free_space_nodirty(dir, bytes_allocated); ocfs2_commit_trans(osb, handle); out: up_write(&oi->ip_alloc_sem); if (data_ac) ocfs2_free_alloc_context(data_ac); if (meta_ac) ocfs2_free_alloc_context(meta_ac); if (dx_leaves) { for (i = 0; i < num_dx_leaves; i++) brelse(dx_leaves[i]); kfree(dx_leaves); } brelse(dirdata_bh); brelse(dx_root_bh); return ret; } /* returns a bh of the 1st new block in the allocation. */ static int ocfs2_do_extend_dir(struct super_block *sb, handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct buffer_head **new_bh) { int status; int extend, did_quota = 0; u64 p_blkno, v_blkno; spin_lock(&OCFS2_I(dir)->ip_lock); extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); spin_unlock(&OCFS2_I(dir)->ip_lock); if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; status = dquot_alloc_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); if (status) goto bail; did_quota = 1; status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 1, 0, parent_fe_bh, handle, data_ac, meta_ac, NULL); BUG_ON(status == -EAGAIN); if (status < 0) { mlog_errno(status); goto bail; } } v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir)); status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL); if (status < 0) { mlog_errno(status); goto bail; } *new_bh = sb_getblk(sb, p_blkno); if (!*new_bh) { status = -ENOMEM; mlog_errno(status); goto bail; } status = 0; bail: if (did_quota && status < 0) dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); return status; } /* * Assumes you already have a cluster lock on the directory. * * 'blocks_wanted' is only used if we have an inline directory which * is to be turned into an extent based one. The size of the dirent to * insert might be larger than the space gained by growing to just one * block, so we may have to grow the inode by two blocks in that case. * * If the directory is already indexed, dx_root_bh must be provided. 
*/ static int ocfs2_extend_dir(struct ocfs2_super *osb, struct inode *dir, struct buffer_head *parent_fe_bh, unsigned int blocks_wanted, struct ocfs2_dir_lookup_result *lookup, struct buffer_head **new_de_bh) { int status = 0; int credits, num_free_extents, drop_alloc_sem = 0; loff_t dir_i_size; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; struct ocfs2_extent_list *el = &fe->id2.i_list; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle = NULL; struct buffer_head *new_bh = NULL; struct ocfs2_dir_entry * de; struct super_block *sb = osb->sb; struct ocfs2_extent_tree et; struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { /* * This would be a code error as an inline directory should * never have an index root. */ BUG_ON(dx_root_bh); status = ocfs2_expand_inline_dir(dir, parent_fe_bh, blocks_wanted, lookup, &new_bh); if (status) { mlog_errno(status); goto bail; } /* Expansion from inline to an indexed directory will * have given us this. */ dx_root_bh = lookup->dl_dx_root_bh; if (blocks_wanted == 1) { /* * If the new dirent will fit inside the space * created by pushing out to one block, then * we can complete the operation * here. Otherwise we have to expand i_size * and format the 2nd block below. */ BUG_ON(new_bh == NULL); goto bail_bh; } /* * Get rid of 'new_bh' - we want to format the 2nd * data block and return that instead. */ brelse(new_bh); new_bh = NULL; down_write(&OCFS2_I(dir)->ip_alloc_sem); drop_alloc_sem = 1; dir_i_size = i_size_read(dir); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; goto do_extend; } down_write(&OCFS2_I(dir)->ip_alloc_sem); drop_alloc_sem = 1; dir_i_size = i_size_read(dir); trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size); /* dir->i_size is always block aligned. 
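	 * Inline directories were handled above, and every other path
	 * in this file grows a directory by whole blocks at a time, so
	 * comparing i_size with the cluster-aligned allocation below is
	 * enough to tell whether we need a fresh cluster or just
	 * another block of one we already own.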
*/ spin_lock(&OCFS2_I(dir)->ip_lock); if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { spin_unlock(&OCFS2_I(dir)->ip_lock); ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), parent_fe_bh); num_free_extents = ocfs2_num_free_extents(&et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); goto bail; } if (!num_free_extents) { status = ocfs2_reserve_new_metadata(osb, el, &meta_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } } status = ocfs2_reserve_clusters(osb, 1, &data_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } if (ocfs2_dir_resv_allowed(osb)) data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; credits = ocfs2_calc_extend_credits(sb, el); } else { spin_unlock(&OCFS2_I(dir)->ip_lock); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; } do_extend: if (ocfs2_dir_indexed(dir)) credits++; /* For attaching the new dirent block to the * dx_root */ handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; mlog_errno(status); goto bail; } status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, data_ac, meta_ac, &new_bh); if (status < 0) { mlog_errno(status); goto bail; } ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh); status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; } memset(new_bh->b_data, 0, sb->s_blocksize); de = (struct ocfs2_dir_entry *) new_bh->b_data; de->inode = 0; if (ocfs2_supports_dir_trailer(dir)) { de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len)); if (ocfs2_dir_indexed(dir)) { status = ocfs2_dx_dir_link_trailer(dir, handle, dx_root_bh, new_bh); if (status) { mlog_errno(status); goto bail; } } } else { de->rec_len = cpu_to_le16(sb->s_blocksize); } ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, new_bh); dir_i_size += dir->i_sb->s_blocksize; i_size_write(dir, dir_i_size); dir->i_blocks = ocfs2_inode_sector_count(dir); status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (status < 0) { mlog_errno(status); goto bail; } bail_bh: *new_de_bh = new_bh; get_bh(*new_de_bh); bail: if (handle) ocfs2_commit_trans(osb, handle); if (drop_alloc_sem) up_write(&OCFS2_I(dir)->ip_alloc_sem); if (data_ac) ocfs2_free_alloc_context(data_ac); if (meta_ac) ocfs2_free_alloc_context(meta_ac); brelse(new_bh); return status; } static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, const char *name, int namelen, struct buffer_head **ret_de_bh, unsigned int *blocks_wanted) { int ret; struct super_block *sb = dir->i_sb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_dir_entry *de, *last_de = NULL; char *first_de, *de_buf, *limit; unsigned long offset = 0; unsigned int rec_len, new_rec_len, free_space; /* * This calculates how many free bytes we'd have in block zero, should * this function force expansion to an extent tree. 
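	 *
	 * A hypothetical example, assuming 4K blocks and a 64-byte
	 * trailer: an inline directory with i_size of 200 bytes would
	 * get free_space = (4096 - 64) - 200 = 3832 bytes in block
	 * zero, which feeds the one-vs-two block decision at the
	 * bottom of this function.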
*/ if (ocfs2_new_dir_wants_trailer(dir)) free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); else free_space = dir->i_sb->s_blocksize - i_size_read(dir); first_de = di->id2.i_data.id_data; de_buf = first_de; limit = de_buf + i_size_read(dir); rec_len = OCFS2_DIR_REC_LEN(namelen); while (de_buf < limit) { de = (struct ocfs2_dir_entry *)de_buf; if (!ocfs2_check_dir_entry(dir, de, di_bh, first_de, i_size_read(dir), offset)) { ret = -ENOENT; goto out; } if (ocfs2_match(namelen, name, de)) { ret = -EEXIST; goto out; } /* * No need to check for a trailing dirent record here as * they're not used for inline dirs. */ if (ocfs2_dirent_would_fit(de, rec_len)) { /* Ok, we found a spot. Return this bh and let * the caller actually fill it in. */ *ret_de_bh = di_bh; get_bh(*ret_de_bh); ret = 0; goto out; } last_de = de; de_buf += le16_to_cpu(de->rec_len); offset += le16_to_cpu(de->rec_len); } /* * We're going to require expansion of the directory - figure * out how many blocks we'll need so that a place for the * dirent can be found. */ *blocks_wanted = 1; new_rec_len = le16_to_cpu(last_de->rec_len) + free_space; if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) *blocks_wanted = 2; ret = -ENOSPC; out: return ret; } static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, int namelen, struct buffer_head **ret_de_bh) { unsigned long offset; struct buffer_head *bh = NULL; unsigned short rec_len; struct ocfs2_dir_entry *de; struct super_block *sb = dir->i_sb; int status; int blocksize = dir->i_sb->s_blocksize; status = ocfs2_read_dir_block(dir, 0, &bh, 0); if (status) goto bail; rec_len = OCFS2_DIR_REC_LEN(namelen); offset = 0; de = (struct ocfs2_dir_entry *) bh->b_data; while (1) { if ((char *)de >= sb->s_blocksize + bh->b_data) { brelse(bh); bh = NULL; if (i_size_read(dir) <= offset) { /* * Caller will have to expand this * directory. */ status = -ENOSPC; goto bail; } status = ocfs2_read_dir_block(dir, offset >> sb->s_blocksize_bits, &bh, 0); if (status) goto bail; /* move to next block */ de = (struct ocfs2_dir_entry *) bh->b_data; } if (!ocfs2_check_dir_entry(dir, de, bh, bh->b_data, blocksize, offset)) { status = -ENOENT; goto bail; } if (ocfs2_match(namelen, name, de)) { status = -EEXIST; goto bail; } if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize, blocksize)) goto next; if (ocfs2_dirent_would_fit(de, rec_len)) { /* Ok, we found a spot. Return this bh and let * the caller actually fill it in. 
*/
			*ret_de_bh = bh;
			get_bh(*ret_de_bh);
			status = 0;
			goto bail;
		}
next:
		offset += le16_to_cpu(de->rec_len);
		de = (struct ocfs2_dir_entry *)((char *) de +
				le16_to_cpu(de->rec_len));
	}

bail:
	brelse(bh);
	if (status)
		mlog_errno(status);

	return status;
}

static int dx_leaf_sort_cmp(const void *a, const void *b)
{
	const struct ocfs2_dx_entry *entry1 = a;
	const struct ocfs2_dx_entry *entry2 = b;
	u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
	u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
	u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
	u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);

	if (major_hash1 > major_hash2)
		return 1;
	if (major_hash1 < major_hash2)
		return -1;

	/*
	 * It is not strictly necessary to sort by minor hash, but it
	 * gives the entries a well-defined order.
	 */
	if (minor_hash1 > minor_hash2)
		return 1;
	if (minor_hash1 < minor_hash2)
		return -1;
	return 0;
}

static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
{
	struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
	int i, num = le16_to_cpu(dl_list->de_num_used);

	for (i = 0; i < (num - 1); i++) {
		if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
		    le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
			return 0;
	}

	return 1;
}

/*
 * Find the optimal value to split this leaf on. This expects the leaf
 * entries to be in sorted order.
 *
 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
 * the hash we want to insert.
 *
 * This function is only concerned with the major hash - that which
 * determines which cluster an item belongs to.
 */
static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
					u32 leaf_cpos, u32 insert_hash,
					u32 *split_hash)
{
	struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
	int i, num_used = le16_to_cpu(dl_list->de_num_used);
	int allsame;

	/*
	 * There are a couple of rare but nasty corner cases we have to
	 * check for here. All of them involve a leaf where all values
	 * have the same hash, which is what we look for first.
	 *
	 * Most of the time, all of the above is false, and we simply
	 * pick the median value for a split.
	 */
	allsame = ocfs2_dx_leaf_same_major(dx_leaf);
	if (allsame) {
		u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);

		if (val == insert_hash) {
			/*
			 * No matter where we would choose to split,
			 * the new entry would want to occupy the same
			 * block as these. Since there's no space left
			 * in their existing block, we know there
			 * won't be space after the split.
			 */
			return -ENOSPC;
		}

		if (val == leaf_cpos) {
			/*
			 * Because val is the same as leaf_cpos (which
			 * is the smallest value this leaf can have),
			 * yet is not equal to insert_hash, then we
			 * know that insert_hash *must* be larger than
			 * val (and leaf_cpos). At least cpos+1 in value.
			 *
			 * We also know then, that there cannot be an
			 * adjacent extent (otherwise we'd be looking
			 * at it). Choosing this value gives us a
			 * chance to get some contiguousness.
			 */
			*split_hash = leaf_cpos + 1;
			return 0;
		}

		if (val > insert_hash) {
			/*
			 * val cannot be the same as insert hash, and
			 * also must be larger than leaf_cpos. Also,
			 * we know that there can't be a leaf between
			 * cpos and val, otherwise the entries with
			 * hash 'val' would be there.
			 */
			*split_hash = val;
			return 0;
		}

		*split_hash = insert_hash;
		return 0;
	}

	/*
	 * Since the records are sorted and the checks above
	 * guaranteed that not all records in this block are the same,
	 * we simply travel forward, from the median, and pick the 1st
	 * record whose value is larger than leaf_cpos.
*/ for (i = (num_used / 2); i < num_used; i++) if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) > leaf_cpos) break; BUG_ON(i == num_used); /* Should be impossible */ *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash); return 0; } /* * Transfer all entries in orig_dx_leaves whose major hash is equal to or * larger than split_hash into new_dx_leaves. We use a temporary * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks. * * Since the block offset inside a leaf (cluster) is a constant mask * of minor_hash, we can optimize - an item at block offset X within * the original cluster, will be at offset X within the new cluster. */ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, handle_t *handle, struct ocfs2_dx_leaf *tmp_dx_leaf, struct buffer_head **orig_dx_leaves, struct buffer_head **new_dx_leaves, int num_dx_leaves) { int i, j, num_used; u32 major_hash; struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; struct ocfs2_dx_entry_list *orig_list, *tmp_list; struct ocfs2_dx_entry *dx_entry; tmp_list = &tmp_dx_leaf->dl_list; for (i = 0; i < num_dx_leaves; i++) { orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; orig_list = &orig_dx_leaf->dl_list; new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; num_used = le16_to_cpu(orig_list->de_num_used); memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize); tmp_list->de_num_used = cpu_to_le16(0); memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used); for (j = 0; j < num_used; j++) { dx_entry = &orig_list->de_entries[j]; major_hash = le32_to_cpu(dx_entry->dx_major_hash); if (major_hash >= split_hash) ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf, dx_entry); else ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf, dx_entry); } memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize); ocfs2_journal_dirty(handle, orig_dx_leaves[i]); ocfs2_journal_dirty(handle, new_dx_leaves[i]); } } static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb, struct ocfs2_dx_root_block *dx_root) { int credits = ocfs2_clusters_to_blocks(osb->sb, 3); credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list); credits += ocfs2_quota_trans_credits(osb->sb); return credits; } /* * Find the median value in dx_leaf_bh and allocate a new leaf to move * half our entries into. */ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, struct buffer_head *dx_root_bh, struct buffer_head *dx_leaf_bh, struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos, u64 leaf_blkno) { struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; int credits, ret, i, num_used, did_quota = 0; u32 cpos, split_hash, insert_hash = hinfo->major_hash; u64 orig_leaves_start; int num_dx_leaves; struct buffer_head **orig_dx_leaves = NULL; struct buffer_head **new_dx_leaves = NULL; struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL; struct ocfs2_extent_tree et; handle_t *handle = NULL; struct ocfs2_dx_root_block *dx_root; struct ocfs2_dx_leaf *tmp_dx_leaf = NULL; trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)leaf_blkno, insert_hash); ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; /* * XXX: This is a rather large limit. We should use a more * realistic value. 
*/ if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX) return -ENOSPC; num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used); if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) { mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: " "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)leaf_blkno, num_used); ret = -EIO; goto out; } orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves); if (!orig_dx_leaves) { ret = -ENOMEM; mlog_errno(ret); goto out; } new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL); if (!new_dx_leaves) { ret = -ENOMEM; mlog_errno(ret); goto out; } ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; mlog_errno(ret); goto out; } ret = dquot_alloc_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); if (ret) goto out_commit; did_quota = 1; ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } /* * This block is changing anyway, so we can sort it in place. */ sort(dx_leaf->dl_list.de_entries, num_used, sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, NULL); ocfs2_journal_dirty(handle, dx_leaf_bh); ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, &split_hash); if (ret) { mlog_errno(ret); goto out_commit; } trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash); /* * We have to carefully order operations here. There are items * which want to be in the new cluster before insert, but in * order to put those items in the new cluster, we alter the * old cluster. A failure to insert gets nasty. * * So, start by reserving writes to the old * cluster. ocfs2_dx_dir_new_cluster will reserve writes on * the new cluster for us, before inserting it. The insert * won't happen if there's an error before that. Once the * insert is done then, we can transfer from one leaf into the * other without fear of hitting any error. */ /* * The leaf transfer wants some scratch space so that we don't * wind up doing a bunch of expensive memmove(). 
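	 *
	 * ocfs2_dx_dir_transfer_leaf() copies each original leaf into
	 * tmp_dx_leaf, zeroes the copy's entry list, then re-inserts
	 * every entry at the tail of either the copy or the matching
	 * new leaf depending on split_hash, and finally copies the
	 * scratch block back. Tail inserts are cheap; deleting entries
	 * in place would memmove() the remainder down for every
	 * removal.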
*/ tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS); if (!tmp_dx_leaf) { ret = -ENOMEM; mlog_errno(ret); goto out_commit; } orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno); ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves, orig_dx_leaves); if (ret) { mlog_errno(ret); goto out_commit; } cpos = split_hash; ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, data_ac, meta_ac, new_dx_leaves, num_dx_leaves); if (ret) { mlog_errno(ret); goto out_commit; } for (i = 0; i < num_dx_leaves; i++) { ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), orig_dx_leaves[i], OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), new_dx_leaves[i], OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } } ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, orig_dx_leaves, new_dx_leaves, num_dx_leaves); out_commit: if (ret < 0 && did_quota) dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_commit_trans(osb, handle); out: if (orig_dx_leaves || new_dx_leaves) { for (i = 0; i < num_dx_leaves; i++) { if (orig_dx_leaves) brelse(orig_dx_leaves[i]); if (new_dx_leaves) brelse(new_dx_leaves[i]); } kfree(orig_dx_leaves); kfree(new_dx_leaves); } if (meta_ac) ocfs2_free_alloc_context(meta_ac); if (data_ac) ocfs2_free_alloc_context(data_ac); kfree(tmp_dx_leaf); return ret; } static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir, struct buffer_head *di_bh, struct buffer_head *dx_root_bh, const char *name, int namelen, struct ocfs2_dir_lookup_result *lookup) { int ret, rebalanced = 0; struct ocfs2_dx_root_block *dx_root; struct buffer_head *dx_leaf_bh = NULL; struct ocfs2_dx_leaf *dx_leaf; u64 blkno; u32 leaf_cpos; dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; restart_search: ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo, &leaf_cpos, &blkno); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh); if (ret) { mlog_errno(ret); goto out; } dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data; if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >= le16_to_cpu(dx_leaf->dl_list.de_count)) { if (rebalanced) { /* * Rebalancing should have provided us with * space in an appropriate leaf. * * XXX: Is this an abnormal condition then? * Should we print a message here? */ ret = -ENOSPC; goto out; } ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh, &lookup->dl_hinfo, leaf_cpos, blkno); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } /* * Restart the lookup. The rebalance might have * changed which block our item fits into. Mark our * progress, so we only execute this once. 
*/
		brelse(dx_leaf_bh);
		dx_leaf_bh = NULL;
		rebalanced = 1;
		goto restart_search;
	}

	lookup->dl_dx_leaf_bh = dx_leaf_bh;
	dx_leaf_bh = NULL;
out:
	brelse(dx_leaf_bh);

	return ret;
}

static int ocfs2_search_dx_free_list(struct inode *dir,
				     struct buffer_head *dx_root_bh,
				     int namelen,
				     struct ocfs2_dir_lookup_result *lookup)
{
	int ret = -ENOSPC;
	struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
	struct ocfs2_dir_block_trailer *db;
	u64 next_block;
	int rec_len = OCFS2_DIR_REC_LEN(namelen);
	struct ocfs2_dx_root_block *dx_root;

	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
	next_block = le64_to_cpu(dx_root->dr_free_blk);

	while (next_block) {
		brelse(prev_leaf_bh);
		prev_leaf_bh = leaf_bh;
		leaf_bh = NULL;

		ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
		if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
			lookup->dl_leaf_bh = leaf_bh;
			lookup->dl_prev_leaf_bh = prev_leaf_bh;
			leaf_bh = NULL;
			prev_leaf_bh = NULL;
			break;
		}

		next_block = le64_to_cpu(db->db_free_next);
	}

	if (!next_block)
		ret = -ENOSPC;

out:
	brelse(leaf_bh);
	brelse(prev_leaf_bh);

	return ret;
}

static int ocfs2_expand_inline_dx_root(struct inode *dir,
				       struct buffer_head *dx_root_bh)
{
	int ret, num_dx_leaves, i, j, did_quota = 0;
	struct buffer_head **dx_leaves = NULL;
	struct ocfs2_extent_tree et;
	u64 insert_blkno;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
	handle_t *handle = NULL;
	struct ocfs2_dx_root_block *dx_root;
	struct ocfs2_dx_entry_list *entry_list;
	struct ocfs2_dx_entry *dx_entry;
	struct ocfs2_dx_leaf *target_leaf;

	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
	if (!dx_leaves) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = dquot_alloc_space_nodirty(dir,
					ocfs2_clusters_to_bytes(osb->sb, 1));
	if (ret)
		goto out_commit;
	did_quota = 1;

	/*
	 * We do this up front, before the allocation, so that a
	 * failure to add the dx_root_bh to the journal won't result
	 * in us losing clusters.
	 */
	ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
					 num_dx_leaves, &insert_blkno);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Transfer the entries from our dx_root into the appropriate
	 * block.
	 */
	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
	entry_list = &dx_root->dr_entries;

	for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
		dx_entry = &entry_list->de_entries[i];

		j = __ocfs2_dx_dir_hash_idx(osb,
					    le32_to_cpu(dx_entry->dx_minor_hash));
		target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;

		ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);

		/* Each leaf has been passed to the journal already
		 * via __ocfs2_dx_dir_new_cluster() */
	}

	dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
	memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
	       offsetof(struct ocfs2_dx_root_block, dr_list));
	dx_root->dr_list.l_count =
		cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));

	/* This should never fail considering we start with an empty
	 * dx_root. */
	ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
	ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
	if (ret)
		mlog_errno(ret);
	did_quota = 0;

	ocfs2_update_inode_fsync_trans(handle, dir, 1);
	ocfs2_journal_dirty(handle, dx_root_bh);

out_commit:
	if (ret < 0 && did_quota)
		dquot_free_space_nodirty(dir,
					 ocfs2_clusters_to_bytes(dir->i_sb, 1));

	ocfs2_commit_trans(osb, handle);

out:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);

	if (dx_leaves) {
		for (i = 0; i < num_dx_leaves; i++)
			brelse(dx_leaves[i]);
		kfree(dx_leaves);
	}

	return ret;
}

static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
{
	struct ocfs2_dx_root_block *dx_root;
	struct ocfs2_dx_entry_list *entry_list;

	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
	entry_list = &dx_root->dr_entries;

	if (le16_to_cpu(entry_list->de_num_used) >=
	    le16_to_cpu(entry_list->de_count))
		return -ENOSPC;

	return 0;
}

static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
					   struct buffer_head *di_bh,
					   const char *name,
					   int namelen,
					   struct ocfs2_dir_lookup_result *lookup)
{
	int ret, free_dx_root = 1;
	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
	struct buffer_head *dx_root_bh = NULL;
	struct buffer_head *leaf_bh = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_dx_root_block *dx_root;

	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
	if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
		ret = -ENOSPC;
		mlog_errno(ret);
		goto out;
	}

	if (ocfs2_dx_root_inline(dx_root)) {
		ret = ocfs2_inline_dx_has_space(dx_root_bh);

		if (ret == 0)
			goto search_el;

		/*
		 * We ran out of room in the root block. Expand it to
		 * an extent, then allow ocfs2_find_dir_space_dx to do
		 * the rest.
		 */
		ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Insert preparation for an indexed directory is split into two
	 * steps. The call to find_dir_space_dx reserves room in the index for
	 * an additional item. If we run out of space there, it's a real error
	 * and we can't continue.
	 */
	ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
				      namelen, lookup);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

search_el:
	/*
	 * Next, we need to find space in the unindexed tree. This call
	 * searches using the free space linked list. If the unindexed tree
	 * lacks sufficient space, we'll expand it below. The expansion code
	 * is smart enough to add any new blocks to the free space list.
	 */
	ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
	if (ret && ret != -ENOSPC) {
		mlog_errno(ret);
		goto out;
	}

	/* Do this up here - ocfs2_extend_dir might need the dx_root */
	lookup->dl_dx_root_bh = dx_root_bh;
	free_dx_root = 0;

	if (ret == -ENOSPC) {
		ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * We make the assumption here that new leaf blocks are added
		 * to the front of our free list.
		 */
		lookup->dl_prev_leaf_bh = NULL;
		lookup->dl_leaf_bh = leaf_bh;
	}

out:
	if (free_dx_root)
		brelse(dx_root_bh);
	return ret;
}

/*
 * Get a directory ready for insert. Any directory allocation required
 * happens here. Success returns zero, and enough context in the dir
 * lookup result that ocfs2_add_entry() will be able to complete the
 * task with minimal performance impact.
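 *
 * On success the lookup result carries, depending on directory type:
 * dl_leaf_bh (the unindexed block to insert into) always; dl_hinfo
 * and dl_dx_root_bh for indexed directories, plus dl_dx_leaf_bh when
 * the index lives in external leaves; and dl_prev_leaf_bh when the
 * target block was found mid-way down the free list, so that
 * __ocfs2_add_entry() can keep the free-space links consistent.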
static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
					   struct buffer_head *di_bh,
					   const char *name,
					   int namelen,
					   struct ocfs2_dir_lookup_result *lookup)
{
	int ret, free_dx_root = 1;
	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
	struct buffer_head *dx_root_bh = NULL;
	struct buffer_head *leaf_bh = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_dx_root_block *dx_root;

	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
	if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
		ret = -ENOSPC;
		mlog_errno(ret);
		goto out;
	}

	if (ocfs2_dx_root_inline(dx_root)) {
		ret = ocfs2_inline_dx_has_space(dx_root_bh);

		if (ret == 0)
			goto search_el;

		/*
		 * We ran out of room in the root block. Expand it to
		 * an extent, then allow ocfs2_find_dir_space_dx to do
		 * the rest.
		 */
		ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Insert preparation for an indexed directory is split into two
	 * steps. The call to find_dir_space_dx reserves room in the index
	 * for an additional item. If we run out of space there, it's a
	 * real error and we can't continue.
	 */
	ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
				      namelen, lookup);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

search_el:
	/*
	 * Next, we need to find space in the unindexed tree. This call
	 * searches using the free space linked list. If the unindexed tree
	 * lacks sufficient space, we'll expand it below. The expansion code
	 * is smart enough to add any new blocks to the free space list.
	 */
	ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
	if (ret && ret != -ENOSPC) {
		mlog_errno(ret);
		goto out;
	}

	/* Do this up here - ocfs2_extend_dir might need the dx_root */
	lookup->dl_dx_root_bh = dx_root_bh;
	free_dx_root = 0;

	if (ret == -ENOSPC) {
		ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * We make the assumption here that new leaf blocks are added
		 * to the front of our free list.
		 */
		lookup->dl_prev_leaf_bh = NULL;
		lookup->dl_leaf_bh = leaf_bh;
	}

out:
	if (free_dx_root)
		brelse(dx_root_bh);
	return ret;
}

/*
 * Get a directory ready for insert. Any directory allocation required
 * happens here. Success returns zero, and enough context in the dir
 * lookup result that ocfs2_add_entry() will be able to complete the
 * task with minimal performance impact.
 */
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
				 struct inode *dir,
				 struct buffer_head *parent_fe_bh,
				 const char *name,
				 int namelen,
				 struct ocfs2_dir_lookup_result *lookup)
{
	int ret;
	unsigned int blocks_wanted = 1;
	struct buffer_head *bh = NULL;

	trace_ocfs2_prepare_dir_for_insert(
		(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);

	/*
	 * Do this up front to reduce confusion.
	 *
	 * The directory might start inline, then be turned into an
	 * indexed one, in which case we'd need to hash deep inside
	 * ocfs2_find_dir_space_id(). Since
	 * ocfs2_prepare_dx_dir_for_insert() also needs this hash
	 * done, there seems no point in spreading out the calls. We
	 * can optimize away the case where the file system doesn't
	 * support indexing.
	 */
	if (ocfs2_supports_indexed_dirs(osb))
		ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);

	if (ocfs2_dir_indexed(dir)) {
		ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
						      name, namelen, lookup);
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
					      namelen, &bh, &blocks_wanted);
	} else
		ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh);

	if (ret && ret != -ENOSPC) {
		mlog_errno(ret);
		goto out;
	}

	if (ret == -ENOSPC) {
		/*
		 * We have to expand the directory to add this name.
		 */
		BUG_ON(bh);

		ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
				       lookup, &bh);
		if (ret) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}

		BUG_ON(!bh);
	}

	lookup->dl_leaf_bh = bh;
	bh = NULL;
out:
	brelse(bh);
	return ret;
}

/*
 * Unlink the dx root from the inode and give its block back to the
 * extent allocator it was suballocated from.
 */
static int ocfs2_dx_dir_remove_index(struct inode *dir,
				     struct buffer_head *di_bh,
				     struct buffer_head *dx_root_bh)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_dx_root_block *dx_root;
	struct inode *dx_alloc_inode = NULL;
	struct buffer_head *dx_alloc_bh = NULL;
	handle_t *handle;
	u64 blk;
	u16 bit;
	u64 bg_blkno;

	dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;

	dx_alloc_inode = ocfs2_get_system_file_inode(osb,
					EXTENT_ALLOC_SYSTEM_INODE,
					le16_to_cpu(dx_root->dr_suballoc_slot));
	if (!dx_alloc_inode) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}
	inode_lock(dx_alloc_inode);

	ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	spin_lock(&OCFS2_I(dir)->ip_lock);
	OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
	spin_unlock(&OCFS2_I(dir)->ip_lock);
	di->i_dx_root = cpu_to_le64(0ULL);
	ocfs2_update_inode_fsync_trans(handle, dir, 1);

	ocfs2_journal_dirty(handle, di_bh);

	blk = le64_to_cpu(dx_root->dr_blkno);
	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
	if (dx_root->dr_suballoc_loc)
		bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
	else
		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
	ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
				       bit, bg_blkno, 1);
	if (ret)
		mlog_errno(ret);

out_commit:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	ocfs2_inode_unlock(dx_alloc_inode, 1);

out_mutex:
	inode_unlock(dx_alloc_inode);
	brelse(dx_alloc_bh);

out:
	iput(dx_alloc_inode);
	return ret;
}
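/*
 * Tear down a directory index: free every cluster in the dx root's
 * extent tree, walking backwards from the highest major hash, then
 * drop the root block itself and clear OCFS2_INDEXED_DIR_FL on the
 * inode. An inline root has no extents and skips straight to the
 * index removal.
 */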
int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
{
	int ret;
	unsigned int clen;
	u32 major_hash = UINT_MAX, p_cpos, cpos;
	u64 blkno;
	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
	struct buffer_head *dx_root_bh = NULL;
	struct ocfs2_dx_root_block *dx_root;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct ocfs2_extent_tree et;

	ocfs2_init_dealloc_ctxt(&dealloc);

	if (!ocfs2_dir_indexed(dir))
		return 0;

	ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;

	if (ocfs2_dx_root_inline(dx_root))
		goto remove_index;

	ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);

	/* XXX: What if dr_clusters is too large? */
	while (le32_to_cpu(dx_root->dr_clusters)) {
		ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
					      major_hash, &cpos, &blkno, &clen);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);

		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
					       &dealloc, 0, false);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (cpos == 0)
			break;

		major_hash = cpos - 1;
	}

remove_index:
	ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
out:
	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &dealloc);

	brelse(dx_root_bh);
	return ret;
}
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020 Google Corporation
 */

#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/mgmt.h>

#include "mgmt_util.h"
#include "msft.h"

#define MSFT_RSSI_THRESHOLD_VALUE_MIN		-127
#define MSFT_RSSI_THRESHOLD_VALUE_MAX		20
#define MSFT_RSSI_LOW_TIMEOUT_MAX		0x3C

#define MSFT_OP_READ_SUPPORTED_FEATURES		0x00
struct msft_cp_read_supported_features {
	__u8   sub_opcode;
} __packed;

struct msft_rp_read_supported_features {
	__u8   status;
	__u8   sub_opcode;
	__le64 features;
	__u8   evt_prefix_len;
	__u8   evt_prefix[];
} __packed;

#define MSFT_OP_LE_MONITOR_ADVERTISEMENT	0x03
#define MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN	0x01
struct msft_le_monitor_advertisement_pattern {
	__u8 length;
	__u8 data_type;
	__u8 start_byte;
	__u8 pattern[];
};

struct msft_le_monitor_advertisement_pattern_data