Total coverage: 266022 (17%) of 1567129
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Sony NFC Port-100 Series driver
 * Copyright (c) 2013, Intel Corporation.
 *
 * Partly based/Inspired by Stephen Tiedemann's nfcpy
 */

#include <linux/module.h>
#include <linux/usb.h>
#include <net/nfc/digital.h>

#define VERSION "0.1"

#define SONY_VENDOR_ID		0x054c
#define RCS380S_PRODUCT_ID	0x06c1
#define RCS380P_PRODUCT_ID	0x06c3

#define PORT100_PROTOCOLS (NFC_PROTO_JEWEL_MASK    | \
			   NFC_PROTO_MIFARE_MASK   | \
			   NFC_PROTO_FELICA_MASK   | \
			   NFC_PROTO_NFC_DEP_MASK  | \
			   NFC_PROTO_ISO14443_MASK | \
			   NFC_PROTO_ISO14443_B_MASK)

#define PORT100_CAPABILITIES (NFC_DIGITAL_DRV_CAPS_IN_CRC | \
			      NFC_DIGITAL_DRV_CAPS_TG_CRC)

/* Standard port100 frame definitions */
#define PORT100_FRAME_HEADER_LEN (sizeof(struct port100_frame) \
				  + 2) /* data[0] CC, data[1] SCC */
#define PORT100_FRAME_TAIL_LEN 2 /* data[len] DCS, data[len + 1] postamble */

#define PORT100_COMM_RF_HEAD_MAX_LEN (sizeof(struct port100_tg_comm_rf_cmd))

/*
 * Max extended frame payload len, excluding CC and SCC
 * which are already in PORT100_FRAME_HEADER_LEN.
 */
#define PORT100_FRAME_MAX_PAYLOAD_LEN 1001

#define PORT100_FRAME_ACK_SIZE 6 /* Preamble (1), SoPC (2), ACK Code (2),
				    Postamble (1) */
static u8 ack_frame[PORT100_FRAME_ACK_SIZE] = {
	0x00, 0x00, 0xff, 0x00, 0xff, 0x00
};

#define PORT100_FRAME_CHECKSUM(f) (f->data[le16_to_cpu(f->datalen)])
#define PORT100_FRAME_POSTAMBLE(f) (f->data[le16_to_cpu(f->datalen) + 1])

/* start of frame */
#define PORT100_FRAME_SOF	0x00FF
#define PORT100_FRAME_EXT	0xFFFF
#define PORT100_FRAME_ACK	0x00FF

/* Port-100 command: in or out */
#define PORT100_FRAME_DIRECTION(f) (f->data[0]) /* CC */
#define PORT100_FRAME_DIR_OUT 0xD6
#define PORT100_FRAME_DIR_IN  0xD7

/* Port-100 sub-command */
#define PORT100_FRAME_CMD(f) (f->data[1]) /* SCC */

#define PORT100_CMD_GET_FIRMWARE_VERSION 0x20
#define PORT100_CMD_GET_COMMAND_TYPE     0x28
#define PORT100_CMD_SET_COMMAND_TYPE     0x2A
#define PORT100_CMD_IN_SET_RF            0x00
#define PORT100_CMD_IN_SET_PROTOCOL      0x02
#define PORT100_CMD_IN_COMM_RF           0x04
#define PORT100_CMD_TG_SET_RF            0x40
#define PORT100_CMD_TG_SET_PROTOCOL      0x42
#define PORT100_CMD_TG_SET_RF_OFF        0x46
#define PORT100_CMD_TG_COMM_RF           0x48
#define PORT100_CMD_SWITCH_RF            0x06

#define PORT100_CMD_RESPONSE(cmd) (cmd + 1)

#define PORT100_CMD_TYPE_IS_SUPPORTED(mask, cmd_type) \
	((mask) & (0x01 << (cmd_type)))
#define PORT100_CMD_TYPE_0	0
#define PORT100_CMD_TYPE_1	1

#define PORT100_CMD_STATUS_OK      0x00
#define PORT100_CMD_STATUS_TIMEOUT 0x80

#define PORT100_MDAA_TGT_HAS_BEEN_ACTIVATED_MASK 0x01
#define PORT100_MDAA_TGT_WAS_ACTIVATED_MASK      0x02

struct port100;

typedef void (*port100_send_async_complete_t)(struct port100 *dev, void *arg,
					      struct sk_buff *resp);

/*
 * Setting sets structure for in_set_rf command
 *
 * @in_*_set_number: Represent the entry indexes in the port-100 RF Base Table.
 *                   This table contains multiple RF setting sets required for
 *                   RF communication.
 *
 * @in_*_comm_type: These fields set the communication type to be used.
*/ struct port100_in_rf_setting { u8 in_send_set_number; u8 in_send_comm_type; u8 in_recv_set_number; u8 in_recv_comm_type; } __packed; #define PORT100_COMM_TYPE_IN_212F 0x01 #define PORT100_COMM_TYPE_IN_424F 0x02 #define PORT100_COMM_TYPE_IN_106A 0x03 #define PORT100_COMM_TYPE_IN_106B 0x07 static const struct port100_in_rf_setting in_rf_settings[] = { [NFC_DIGITAL_RF_TECH_212F] = { .in_send_set_number = 1, .in_send_comm_type = PORT100_COMM_TYPE_IN_212F, .in_recv_set_number = 15, .in_recv_comm_type = PORT100_COMM_TYPE_IN_212F, }, [NFC_DIGITAL_RF_TECH_424F] = { .in_send_set_number = 1, .in_send_comm_type = PORT100_COMM_TYPE_IN_424F, .in_recv_set_number = 15, .in_recv_comm_type = PORT100_COMM_TYPE_IN_424F, }, [NFC_DIGITAL_RF_TECH_106A] = { .in_send_set_number = 2, .in_send_comm_type = PORT100_COMM_TYPE_IN_106A, .in_recv_set_number = 15, .in_recv_comm_type = PORT100_COMM_TYPE_IN_106A, }, [NFC_DIGITAL_RF_TECH_106B] = { .in_send_set_number = 3, .in_send_comm_type = PORT100_COMM_TYPE_IN_106B, .in_recv_set_number = 15, .in_recv_comm_type = PORT100_COMM_TYPE_IN_106B, }, /* Ensures the array has NFC_DIGITAL_RF_TECH_LAST elements */ [NFC_DIGITAL_RF_TECH_LAST] = { 0 }, }; /** * struct port100_tg_rf_setting - Setting sets structure for tg_set_rf command * * @tg_set_number: Represents the entry index in the port-100 RF Base Table. * This table contains multiple RF setting sets required for RF * communication. this field is used for both send and receive * settings. * * @tg_comm_type: Sets the communication type to be used to send and receive * data. */ struct port100_tg_rf_setting { u8 tg_set_number; u8 tg_comm_type; } __packed; #define PORT100_COMM_TYPE_TG_106A 0x0B #define PORT100_COMM_TYPE_TG_212F 0x0C #define PORT100_COMM_TYPE_TG_424F 0x0D static const struct port100_tg_rf_setting tg_rf_settings[] = { [NFC_DIGITAL_RF_TECH_106A] = { .tg_set_number = 8, .tg_comm_type = PORT100_COMM_TYPE_TG_106A, }, [NFC_DIGITAL_RF_TECH_212F] = { .tg_set_number = 8, .tg_comm_type = PORT100_COMM_TYPE_TG_212F, }, [NFC_DIGITAL_RF_TECH_424F] = { .tg_set_number = 8, .tg_comm_type = PORT100_COMM_TYPE_TG_424F, }, /* Ensures the array has NFC_DIGITAL_RF_TECH_LAST elements */ [NFC_DIGITAL_RF_TECH_LAST] = { 0 }, }; #define PORT100_IN_PROT_INITIAL_GUARD_TIME 0x00 #define PORT100_IN_PROT_ADD_CRC 0x01 #define PORT100_IN_PROT_CHECK_CRC 0x02 #define PORT100_IN_PROT_MULTI_CARD 0x03 #define PORT100_IN_PROT_ADD_PARITY 0x04 #define PORT100_IN_PROT_CHECK_PARITY 0x05 #define PORT100_IN_PROT_BITWISE_AC_RECV_MODE 0x06 #define PORT100_IN_PROT_VALID_BIT_NUMBER 0x07 #define PORT100_IN_PROT_CRYPTO1 0x08 #define PORT100_IN_PROT_ADD_SOF 0x09 #define PORT100_IN_PROT_CHECK_SOF 0x0A #define PORT100_IN_PROT_ADD_EOF 0x0B #define PORT100_IN_PROT_CHECK_EOF 0x0C #define PORT100_IN_PROT_DEAF_TIME 0x0E #define PORT100_IN_PROT_CRM 0x0F #define PORT100_IN_PROT_CRM_MIN_LEN 0x10 #define PORT100_IN_PROT_T1_TAG_FRAME 0x11 #define PORT100_IN_PROT_RFCA 0x12 #define PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR 0x13 #define PORT100_IN_PROT_END 0x14 #define PORT100_IN_MAX_NUM_PROTOCOLS 19 #define PORT100_TG_PROT_TU 0x00 #define PORT100_TG_PROT_RF_OFF 0x01 #define PORT100_TG_PROT_CRM 0x02 #define PORT100_TG_PROT_END 0x03 #define PORT100_TG_MAX_NUM_PROTOCOLS 3 struct port100_protocol { u8 number; u8 value; } __packed; static const struct port100_protocol in_protocols[][PORT100_IN_MAX_NUM_PROTOCOLS + 1] = { [NFC_DIGITAL_FRAMING_NFCA_SHORT] = { { PORT100_IN_PROT_INITIAL_GUARD_TIME, 6 }, { PORT100_IN_PROT_ADD_CRC, 0 }, { PORT100_IN_PROT_CHECK_CRC, 0 }, { 
PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 0 }, { PORT100_IN_PROT_CHECK_PARITY, 1 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 7 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 0 }, { PORT100_IN_PROT_CHECK_SOF, 0 }, { PORT100_IN_PROT_ADD_EOF, 0 }, { PORT100_IN_PROT_CHECK_EOF, 0 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_STANDARD] = { { PORT100_IN_PROT_INITIAL_GUARD_TIME, 6 }, { PORT100_IN_PROT_ADD_CRC, 0 }, { PORT100_IN_PROT_CHECK_CRC, 0 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 1 }, { PORT100_IN_PROT_CHECK_PARITY, 1 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 0 }, { PORT100_IN_PROT_CHECK_SOF, 0 }, { PORT100_IN_PROT_ADD_EOF, 0 }, { PORT100_IN_PROT_CHECK_EOF, 0 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_STANDARD_WITH_CRC_A] = { { PORT100_IN_PROT_INITIAL_GUARD_TIME, 6 }, { PORT100_IN_PROT_ADD_CRC, 1 }, { PORT100_IN_PROT_CHECK_CRC, 1 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 1 }, { PORT100_IN_PROT_CHECK_PARITY, 1 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 0 }, { PORT100_IN_PROT_CHECK_SOF, 0 }, { PORT100_IN_PROT_ADD_EOF, 0 }, { PORT100_IN_PROT_CHECK_EOF, 0 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_T1T] = { /* nfc_digital_framing_nfca_short */ { PORT100_IN_PROT_ADD_CRC, 2 }, { PORT100_IN_PROT_CHECK_CRC, 2 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_T1_TAG_FRAME, 2 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_T2T] = { /* nfc_digital_framing_nfca_standard */ { PORT100_IN_PROT_ADD_CRC, 1 }, { PORT100_IN_PROT_CHECK_CRC, 0 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_T4T] = { /* nfc_digital_framing_nfca_standard_with_crc_a */ { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_NFC_DEP] = { /* nfc_digital_framing_nfca_standard */ { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF] = { { PORT100_IN_PROT_INITIAL_GUARD_TIME, 18 }, { PORT100_IN_PROT_ADD_CRC, 1 }, { PORT100_IN_PROT_CHECK_CRC, 1 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 0 }, { PORT100_IN_PROT_CHECK_PARITY, 0 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 0 }, { PORT100_IN_PROT_CHECK_SOF, 0 }, { PORT100_IN_PROT_ADD_EOF, 0 }, { PORT100_IN_PROT_CHECK_EOF, 0 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF_T3T] = { /* 
nfc_digital_framing_nfcf */ { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF_NFC_DEP] = { /* nfc_digital_framing_nfcf */ { PORT100_IN_PROT_INITIAL_GUARD_TIME, 18 }, { PORT100_IN_PROT_ADD_CRC, 1 }, { PORT100_IN_PROT_CHECK_CRC, 1 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 0 }, { PORT100_IN_PROT_CHECK_PARITY, 0 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 0 }, { PORT100_IN_PROT_CHECK_SOF, 0 }, { PORT100_IN_PROT_ADD_EOF, 0 }, { PORT100_IN_PROT_CHECK_EOF, 0 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFC_DEP_ACTIVATED] = { { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCB] = { { PORT100_IN_PROT_INITIAL_GUARD_TIME, 20 }, { PORT100_IN_PROT_ADD_CRC, 1 }, { PORT100_IN_PROT_CHECK_CRC, 1 }, { PORT100_IN_PROT_MULTI_CARD, 0 }, { PORT100_IN_PROT_ADD_PARITY, 0 }, { PORT100_IN_PROT_CHECK_PARITY, 0 }, { PORT100_IN_PROT_BITWISE_AC_RECV_MODE, 0 }, { PORT100_IN_PROT_VALID_BIT_NUMBER, 8 }, { PORT100_IN_PROT_CRYPTO1, 0 }, { PORT100_IN_PROT_ADD_SOF, 1 }, { PORT100_IN_PROT_CHECK_SOF, 1 }, { PORT100_IN_PROT_ADD_EOF, 1 }, { PORT100_IN_PROT_CHECK_EOF, 1 }, { PORT100_IN_PROT_DEAF_TIME, 4 }, { PORT100_IN_PROT_CRM, 0 }, { PORT100_IN_PROT_CRM_MIN_LEN, 0 }, { PORT100_IN_PROT_T1_TAG_FRAME, 0 }, { PORT100_IN_PROT_RFCA, 0 }, { PORT100_IN_PROT_GUARD_TIME_AT_INITIATOR, 6 }, { PORT100_IN_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCB_T4T] = { /* nfc_digital_framing_nfcb */ { PORT100_IN_PROT_END, 0 }, }, /* Ensures the array has NFC_DIGITAL_FRAMING_LAST elements */ [NFC_DIGITAL_FRAMING_LAST] = { { PORT100_IN_PROT_END, 0 }, }, }; static const struct port100_protocol tg_protocols[][PORT100_TG_MAX_NUM_PROTOCOLS + 1] = { [NFC_DIGITAL_FRAMING_NFCA_SHORT] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_STANDARD] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_STANDARD_WITH_CRC_A] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_T1T] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_T2T] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCA_NFC_DEP] = { { PORT100_TG_PROT_TU, 1 }, { PORT100_TG_PROT_RF_OFF, 0 }, { PORT100_TG_PROT_CRM, 7 }, { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF_T3T] = { { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFCF_NFC_DEP] = { { PORT100_TG_PROT_TU, 1 }, { PORT100_TG_PROT_RF_OFF, 0 }, { PORT100_TG_PROT_CRM, 7 }, { PORT100_TG_PROT_END, 0 }, }, [NFC_DIGITAL_FRAMING_NFC_DEP_ACTIVATED] = { { PORT100_TG_PROT_RF_OFF, 1 }, { PORT100_TG_PROT_END, 0 }, }, /* Ensures the array has NFC_DIGITAL_FRAMING_LAST elements */ [NFC_DIGITAL_FRAMING_LAST] = { { PORT100_TG_PROT_END, 0 }, }, }; struct port100 { struct nfc_digital_dev *nfc_digital_dev; int skb_headroom; int skb_tailroom; struct usb_device *udev; struct usb_interface *interface; struct urb *out_urb; struct urb *in_urb; /* This mutex protects the out_urb and avoids to submit a new command * through port100_send_frame_async() while the previous one is being * canceled through port100_abort_cmd(). */ struct mutex out_urb_lock; struct work_struct cmd_complete_work; u8 cmd_type; /* The digital stack serializes commands to be sent. 
There is no need * for any queuing/locking mechanism at driver level. */ struct port100_cmd *cmd; bool cmd_cancel; struct completion cmd_cancel_done; }; struct port100_cmd { u8 code; int status; struct sk_buff *req; struct sk_buff *resp; int resp_len; port100_send_async_complete_t complete_cb; void *complete_cb_context; }; struct port100_frame { u8 preamble; __be16 start_frame; __be16 extended_frame; __le16 datalen; u8 datalen_checksum; u8 data[]; } __packed; struct port100_ack_frame { u8 preamble; __be16 start_frame; __be16 ack_frame; u8 postambule; } __packed; struct port100_cb_arg { nfc_digital_cmd_complete_t complete_cb; void *complete_arg; u8 mdaa; }; struct port100_tg_comm_rf_cmd { __le16 guard_time; __le16 send_timeout; u8 mdaa; u8 nfca_param[6]; u8 nfcf_param[18]; u8 mf_halted; u8 arae_flag; __le16 recv_timeout; u8 data[]; } __packed; struct port100_tg_comm_rf_res { u8 comm_type; u8 ar_status; u8 target_activated; __le32 status; u8 data[]; } __packed; /* The rule: value + checksum = 0 */ static inline u8 port100_checksum(u16 value) { return ~(((u8 *)&value)[0] + ((u8 *)&value)[1]) + 1; } /* The rule: sum(data elements) + checksum = 0 */ static u8 port100_data_checksum(const u8 *data, int datalen) { u8 sum = 0; int i; for (i = 0; i < datalen; i++) sum += data[i]; return port100_checksum(sum); } static void port100_tx_frame_init(void *_frame, u8 cmd_code) { struct port100_frame *frame = _frame; frame->preamble = 0; frame->start_frame = cpu_to_be16(PORT100_FRAME_SOF); frame->extended_frame = cpu_to_be16(PORT100_FRAME_EXT); PORT100_FRAME_DIRECTION(frame) = PORT100_FRAME_DIR_OUT; PORT100_FRAME_CMD(frame) = cmd_code; frame->datalen = cpu_to_le16(2); } static void port100_tx_frame_finish(void *_frame) { struct port100_frame *frame = _frame; frame->datalen_checksum = port100_checksum(le16_to_cpu(frame->datalen)); PORT100_FRAME_CHECKSUM(frame) = port100_data_checksum(frame->data, le16_to_cpu(frame->datalen)); PORT100_FRAME_POSTAMBLE(frame) = 0; } static void port100_tx_update_payload_len(void *_frame, int len) { struct port100_frame *frame = _frame; le16_add_cpu(&frame->datalen, len); } static bool port100_rx_frame_is_valid(const void *_frame) { u8 checksum; const struct port100_frame *frame = _frame; if (frame->start_frame != cpu_to_be16(PORT100_FRAME_SOF) || frame->extended_frame != cpu_to_be16(PORT100_FRAME_EXT)) return false; checksum = port100_checksum(le16_to_cpu(frame->datalen)); if (checksum != frame->datalen_checksum) return false; checksum = port100_data_checksum(frame->data, le16_to_cpu(frame->datalen)); if (checksum != PORT100_FRAME_CHECKSUM(frame)) return false; return true; } static bool port100_rx_frame_is_ack(const struct port100_ack_frame *frame) { return (frame->start_frame == cpu_to_be16(PORT100_FRAME_SOF) && frame->ack_frame == cpu_to_be16(PORT100_FRAME_ACK)); } static inline int port100_rx_frame_size(const void *frame) { const struct port100_frame *f = frame; return sizeof(struct port100_frame) + le16_to_cpu(f->datalen) + PORT100_FRAME_TAIL_LEN; } static bool port100_rx_frame_is_cmd_response(const struct port100 *dev, const void *frame) { const struct port100_frame *f = frame; return (PORT100_FRAME_CMD(f) == PORT100_CMD_RESPONSE(dev->cmd->code)); } static void port100_recv_response(struct urb *urb) { struct port100 *dev = urb->context; struct port100_cmd *cmd = dev->cmd; u8 *in_frame; cmd->status = urb->status; switch (urb->status) { case 0: break; /* success */ case -ECONNRESET: case -ENOENT: nfc_dbg(&dev->interface->dev, "The urb has been canceled (status %d)\n", 
urb->status); goto sched_wq; case -ESHUTDOWN: default: nfc_err(&dev->interface->dev, "Urb failure (status %d)\n", urb->status); goto sched_wq; } in_frame = dev->in_urb->transfer_buffer; if (!port100_rx_frame_is_valid(in_frame)) { nfc_err(&dev->interface->dev, "Received an invalid frame\n"); cmd->status = -EIO; goto sched_wq; } print_hex_dump_debug("PORT100 RX: ", DUMP_PREFIX_NONE, 16, 1, in_frame, port100_rx_frame_size(in_frame), false); if (!port100_rx_frame_is_cmd_response(dev, in_frame)) { nfc_err(&dev->interface->dev, "It's not the response to the last command\n"); cmd->status = -EIO; goto sched_wq; } sched_wq: schedule_work(&dev->cmd_complete_work); } static int port100_submit_urb_for_response(const struct port100 *dev, gfp_t flags) { dev->in_urb->complete = port100_recv_response; return usb_submit_urb(dev->in_urb, flags); } static void port100_recv_ack(struct urb *urb) { struct port100 *dev = urb->context; struct port100_cmd *cmd = dev->cmd; const struct port100_ack_frame *in_frame; int rc; cmd->status = urb->status; switch (urb->status) { case 0: break; /* success */ case -ECONNRESET: case -ENOENT: nfc_dbg(&dev->interface->dev, "The urb has been stopped (status %d)\n", urb->status); goto sched_wq; case -ESHUTDOWN: default: nfc_err(&dev->interface->dev, "Urb failure (status %d)\n", urb->status); goto sched_wq; } in_frame = dev->in_urb->transfer_buffer; if (!port100_rx_frame_is_ack(in_frame)) { nfc_err(&dev->interface->dev, "Received an invalid ack\n"); cmd->status = -EIO; goto sched_wq; } rc = port100_submit_urb_for_response(dev, GFP_ATOMIC); if (rc) { nfc_err(&dev->interface->dev, "usb_submit_urb failed with result %d\n", rc); cmd->status = rc; goto sched_wq; } return; sched_wq: schedule_work(&dev->cmd_complete_work); } static int port100_submit_urb_for_ack(const struct port100 *dev, gfp_t flags) { dev->in_urb->complete = port100_recv_ack; return usb_submit_urb(dev->in_urb, flags); } static int port100_send_ack(struct port100 *dev) { int rc = 0; mutex_lock(&dev->out_urb_lock); /* * If prior cancel is in-flight (dev->cmd_cancel == true), we * can skip to send cancel. Then this will wait the prior * cancel, or merged into the next cancel rarely if next * cancel was started before waiting done. In any case, this * will be waked up soon or later. */ if (!dev->cmd_cancel) { reinit_completion(&dev->cmd_cancel_done); usb_kill_urb(dev->out_urb); dev->out_urb->transfer_buffer = ack_frame; dev->out_urb->transfer_buffer_length = sizeof(ack_frame); rc = usb_submit_urb(dev->out_urb, GFP_KERNEL); /* * Set the cmd_cancel flag only if the URB has been * successfully submitted. It will be reset by the out * URB completion callback port100_send_complete(). */ dev->cmd_cancel = !rc; } mutex_unlock(&dev->out_urb_lock); if (!rc) wait_for_completion(&dev->cmd_cancel_done); return rc; } static int port100_send_frame_async(struct port100 *dev, const struct sk_buff *out, const struct sk_buff *in, int in_len) { int rc; mutex_lock(&dev->out_urb_lock); /* A command cancel frame as been sent through dev->out_urb. Don't try * to submit a new one. 
*/ if (dev->cmd_cancel) { rc = -EAGAIN; goto exit; } dev->out_urb->transfer_buffer = out->data; dev->out_urb->transfer_buffer_length = out->len; dev->in_urb->transfer_buffer = in->data; dev->in_urb->transfer_buffer_length = in_len; print_hex_dump_debug("PORT100 TX: ", DUMP_PREFIX_NONE, 16, 1, out->data, out->len, false); rc = usb_submit_urb(dev->out_urb, GFP_KERNEL); if (rc) goto exit; rc = port100_submit_urb_for_ack(dev, GFP_KERNEL); if (rc) usb_kill_urb(dev->out_urb); exit: mutex_unlock(&dev->out_urb_lock); return rc; } static void port100_build_cmd_frame(struct port100 *dev, u8 cmd_code, struct sk_buff *skb) { /* payload is already there, just update datalen */ int payload_len = skb->len; skb_push(skb, PORT100_FRAME_HEADER_LEN); skb_put(skb, PORT100_FRAME_TAIL_LEN); port100_tx_frame_init(skb->data, cmd_code); port100_tx_update_payload_len(skb->data, payload_len); port100_tx_frame_finish(skb->data); } static void port100_send_async_complete(struct port100 *dev) { struct port100_cmd *cmd = dev->cmd; int status = cmd->status; struct sk_buff *req = cmd->req; struct sk_buff *resp = cmd->resp; dev_kfree_skb(req); dev->cmd = NULL; if (status < 0) { cmd->complete_cb(dev, cmd->complete_cb_context, ERR_PTR(status)); dev_kfree_skb(resp); goto done; } skb_put(resp, port100_rx_frame_size(resp->data)); skb_pull(resp, PORT100_FRAME_HEADER_LEN); skb_trim(resp, resp->len - PORT100_FRAME_TAIL_LEN); cmd->complete_cb(dev, cmd->complete_cb_context, resp); done: kfree(cmd); } static int port100_send_cmd_async(struct port100 *dev, u8 cmd_code, struct sk_buff *req, port100_send_async_complete_t complete_cb, void *complete_cb_context) { struct port100_cmd *cmd; struct sk_buff *resp; int rc; int resp_len = PORT100_FRAME_HEADER_LEN + PORT100_FRAME_MAX_PAYLOAD_LEN + PORT100_FRAME_TAIL_LEN; if (dev->cmd) { nfc_err(&dev->interface->dev, "A command is still in process\n"); return -EBUSY; } resp = alloc_skb(resp_len, GFP_KERNEL); if (!resp) return -ENOMEM; cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); if (!cmd) { dev_kfree_skb(resp); return -ENOMEM; } cmd->code = cmd_code; cmd->req = req; cmd->resp = resp; cmd->resp_len = resp_len; cmd->complete_cb = complete_cb; cmd->complete_cb_context = complete_cb_context; port100_build_cmd_frame(dev, cmd_code, req); dev->cmd = cmd; rc = port100_send_frame_async(dev, req, resp, resp_len); if (rc) { kfree(cmd); dev_kfree_skb(resp); dev->cmd = NULL; } return rc; } struct port100_sync_cmd_response { struct sk_buff *resp; struct completion done; }; static void port100_wq_cmd_complete(struct work_struct *work) { struct port100 *dev = container_of(work, struct port100, cmd_complete_work); port100_send_async_complete(dev); } static void port100_send_sync_complete(struct port100 *dev, void *_arg, struct sk_buff *resp) { struct port100_sync_cmd_response *arg = _arg; arg->resp = resp; complete(&arg->done); } static struct sk_buff *port100_send_cmd_sync(struct port100 *dev, u8 cmd_code, struct sk_buff *req) { int rc; struct port100_sync_cmd_response arg; init_completion(&arg.done); rc = port100_send_cmd_async(dev, cmd_code, req, port100_send_sync_complete, &arg); if (rc) { dev_kfree_skb(req); return ERR_PTR(rc); } wait_for_completion(&arg.done); return arg.resp; } static void port100_send_complete(struct urb *urb) { struct port100 *dev = urb->context; if (dev->cmd_cancel) { complete_all(&dev->cmd_cancel_done); dev->cmd_cancel = false; } switch (urb->status) { case 0: break; /* success */ case -ECONNRESET: case -ENOENT: nfc_dbg(&dev->interface->dev, "The urb has been stopped (status %d)\n", 
urb->status); break; case -ESHUTDOWN: default: nfc_err(&dev->interface->dev, "Urb failure (status %d)\n", urb->status); } } static void port100_abort_cmd(struct nfc_digital_dev *ddev) { struct port100 *dev = nfc_digital_get_drvdata(ddev); /* An ack will cancel the last issued command */ port100_send_ack(dev); /* cancel the urb request */ usb_kill_urb(dev->in_urb); } static struct sk_buff *port100_alloc_skb(const struct port100 *dev, unsigned int size) { struct sk_buff *skb; skb = alloc_skb(dev->skb_headroom + dev->skb_tailroom + size, GFP_KERNEL); if (skb) skb_reserve(skb, dev->skb_headroom); return skb; } static int port100_set_command_type(struct port100 *dev, u8 command_type) { struct sk_buff *skb; struct sk_buff *resp; int rc; skb = port100_alloc_skb(dev, 1); if (!skb) return -ENOMEM; skb_put_u8(skb, command_type); resp = port100_send_cmd_sync(dev, PORT100_CMD_SET_COMMAND_TYPE, skb); if (IS_ERR(resp)) return PTR_ERR(resp); rc = resp->data[0]; dev_kfree_skb(resp); return rc; } static u64 port100_get_command_type_mask(struct port100 *dev) { struct sk_buff *skb; struct sk_buff *resp; u64 mask; skb = port100_alloc_skb(dev, 0); if (!skb) return 0; resp = port100_send_cmd_sync(dev, PORT100_CMD_GET_COMMAND_TYPE, skb); if (IS_ERR(resp)) return 0; if (resp->len < 8) mask = 0; else mask = be64_to_cpu(*(__be64 *)resp->data); dev_kfree_skb(resp); return mask; } static u16 port100_get_firmware_version(struct port100 *dev) { struct sk_buff *skb; struct sk_buff *resp; u16 fw_ver; skb = port100_alloc_skb(dev, 0); if (!skb) return 0; resp = port100_send_cmd_sync(dev, PORT100_CMD_GET_FIRMWARE_VERSION, skb); if (IS_ERR(resp)) return 0; fw_ver = le16_to_cpu(*(__le16 *)resp->data); dev_kfree_skb(resp); return fw_ver; } static int port100_switch_rf(struct nfc_digital_dev *ddev, bool on) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct sk_buff *skb, *resp; skb = port100_alloc_skb(dev, 1); if (!skb) return -ENOMEM; skb_put_u8(skb, on ? 
1 : 0); /* Cancel the last command if the device is being switched off */ if (!on) port100_abort_cmd(ddev); resp = port100_send_cmd_sync(dev, PORT100_CMD_SWITCH_RF, skb); if (IS_ERR(resp)) return PTR_ERR(resp); dev_kfree_skb(resp); return 0; } static int port100_in_set_rf(struct nfc_digital_dev *ddev, u8 rf) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct sk_buff *skb; struct sk_buff *resp; int rc; if (rf >= NFC_DIGITAL_RF_TECH_LAST) return -EINVAL; skb = port100_alloc_skb(dev, sizeof(struct port100_in_rf_setting)); if (!skb) return -ENOMEM; skb_put_data(skb, &in_rf_settings[rf], sizeof(struct port100_in_rf_setting)); resp = port100_send_cmd_sync(dev, PORT100_CMD_IN_SET_RF, skb); if (IS_ERR(resp)) return PTR_ERR(resp); rc = resp->data[0]; dev_kfree_skb(resp); return rc; } static int port100_in_set_framing(struct nfc_digital_dev *ddev, int param) { struct port100 *dev = nfc_digital_get_drvdata(ddev); const struct port100_protocol *protocols; struct sk_buff *skb; struct sk_buff *resp; int num_protocols; size_t size; int rc; if (param >= NFC_DIGITAL_FRAMING_LAST) return -EINVAL; protocols = in_protocols[param]; num_protocols = 0; while (protocols[num_protocols].number != PORT100_IN_PROT_END) num_protocols++; if (!num_protocols) return 0; size = sizeof(struct port100_protocol) * num_protocols; skb = port100_alloc_skb(dev, size); if (!skb) return -ENOMEM; skb_put_data(skb, protocols, size); resp = port100_send_cmd_sync(dev, PORT100_CMD_IN_SET_PROTOCOL, skb); if (IS_ERR(resp)) return PTR_ERR(resp); rc = resp->data[0]; dev_kfree_skb(resp); return rc; } static int port100_in_configure_hw(struct nfc_digital_dev *ddev, int type, int param) { if (type == NFC_DIGITAL_CONFIG_RF_TECH) return port100_in_set_rf(ddev, param); if (type == NFC_DIGITAL_CONFIG_FRAMING) return port100_in_set_framing(ddev, param); return -EINVAL; } static void port100_in_comm_rf_complete(struct port100 *dev, void *arg, struct sk_buff *resp) { const struct port100_cb_arg *cb_arg = arg; nfc_digital_cmd_complete_t cb = cb_arg->complete_cb; u32 status; int rc; if (IS_ERR(resp)) { rc = PTR_ERR(resp); goto exit; } if (resp->len < 4) { nfc_err(&dev->interface->dev, "Invalid packet length received\n"); rc = -EIO; goto error; } status = le32_to_cpu(*(__le32 *)resp->data); skb_pull(resp, sizeof(u32)); if (status == PORT100_CMD_STATUS_TIMEOUT) { rc = -ETIMEDOUT; goto error; } if (status != PORT100_CMD_STATUS_OK) { nfc_err(&dev->interface->dev, "in_comm_rf failed with status 0x%08x\n", status); rc = -EIO; goto error; } /* Remove collision bits byte */ skb_pull(resp, 1); goto exit; error: kfree_skb(resp); resp = ERR_PTR(rc); exit: cb(dev->nfc_digital_dev, cb_arg->complete_arg, resp); kfree(cb_arg); } static int port100_in_send_cmd(struct nfc_digital_dev *ddev, struct sk_buff *skb, u16 _timeout, nfc_digital_cmd_complete_t cb, void *arg) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct port100_cb_arg *cb_arg; __le16 timeout; cb_arg = kzalloc(sizeof(struct port100_cb_arg), GFP_KERNEL); if (!cb_arg) return -ENOMEM; cb_arg->complete_cb = cb; cb_arg->complete_arg = arg; timeout = cpu_to_le16(_timeout * 10); memcpy(skb_push(skb, sizeof(__le16)), &timeout, sizeof(__le16)); return port100_send_cmd_async(dev, PORT100_CMD_IN_COMM_RF, skb, port100_in_comm_rf_complete, cb_arg); } static int port100_tg_set_rf(struct nfc_digital_dev *ddev, u8 rf) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct sk_buff *skb; struct sk_buff *resp; int rc; if (rf >= NFC_DIGITAL_RF_TECH_LAST) return -EINVAL; skb = port100_alloc_skb(dev, 
sizeof(struct port100_tg_rf_setting)); if (!skb) return -ENOMEM; skb_put_data(skb, &tg_rf_settings[rf], sizeof(struct port100_tg_rf_setting)); resp = port100_send_cmd_sync(dev, PORT100_CMD_TG_SET_RF, skb); if (IS_ERR(resp)) return PTR_ERR(resp); rc = resp->data[0]; dev_kfree_skb(resp); return rc; } static int port100_tg_set_framing(struct nfc_digital_dev *ddev, int param) { struct port100 *dev = nfc_digital_get_drvdata(ddev); const struct port100_protocol *protocols; struct sk_buff *skb; struct sk_buff *resp; int rc; int num_protocols; size_t size; if (param >= NFC_DIGITAL_FRAMING_LAST) return -EINVAL; protocols = tg_protocols[param]; num_protocols = 0; while (protocols[num_protocols].number != PORT100_TG_PROT_END) num_protocols++; if (!num_protocols) return 0; size = sizeof(struct port100_protocol) * num_protocols; skb = port100_alloc_skb(dev, size); if (!skb) return -ENOMEM; skb_put_data(skb, protocols, size); resp = port100_send_cmd_sync(dev, PORT100_CMD_TG_SET_PROTOCOL, skb); if (IS_ERR(resp)) return PTR_ERR(resp); rc = resp->data[0]; dev_kfree_skb(resp); return rc; } static int port100_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param) { if (type == NFC_DIGITAL_CONFIG_RF_TECH) return port100_tg_set_rf(ddev, param); if (type == NFC_DIGITAL_CONFIG_FRAMING) return port100_tg_set_framing(ddev, param); return -EINVAL; } static bool port100_tg_target_activated(struct port100 *dev, u8 tgt_activated) { u8 mask; switch (dev->cmd_type) { case PORT100_CMD_TYPE_0: mask = PORT100_MDAA_TGT_HAS_BEEN_ACTIVATED_MASK; break; case PORT100_CMD_TYPE_1: mask = PORT100_MDAA_TGT_HAS_BEEN_ACTIVATED_MASK | PORT100_MDAA_TGT_WAS_ACTIVATED_MASK; break; default: nfc_err(&dev->interface->dev, "Unknown command type\n"); return false; } return ((tgt_activated & mask) == mask); } static void port100_tg_comm_rf_complete(struct port100 *dev, void *arg, struct sk_buff *resp) { u32 status; const struct port100_cb_arg *cb_arg = arg; nfc_digital_cmd_complete_t cb = cb_arg->complete_cb; struct port100_tg_comm_rf_res *hdr; if (IS_ERR(resp)) goto exit; hdr = (struct port100_tg_comm_rf_res *)resp->data; status = le32_to_cpu(hdr->status); if (cb_arg->mdaa && !port100_tg_target_activated(dev, hdr->target_activated)) { kfree_skb(resp); resp = ERR_PTR(-ETIMEDOUT); goto exit; } skb_pull(resp, sizeof(struct port100_tg_comm_rf_res)); if (status != PORT100_CMD_STATUS_OK) { kfree_skb(resp); if (status == PORT100_CMD_STATUS_TIMEOUT) resp = ERR_PTR(-ETIMEDOUT); else resp = ERR_PTR(-EIO); } exit: cb(dev->nfc_digital_dev, cb_arg->complete_arg, resp); kfree(cb_arg); } static int port100_tg_send_cmd(struct nfc_digital_dev *ddev, struct sk_buff *skb, u16 timeout, nfc_digital_cmd_complete_t cb, void *arg) { struct port100 *dev = nfc_digital_get_drvdata(ddev); struct port100_tg_comm_rf_cmd *hdr; struct port100_cb_arg *cb_arg; cb_arg = kzalloc(sizeof(struct port100_cb_arg), GFP_KERNEL); if (!cb_arg) return -ENOMEM; cb_arg->complete_cb = cb; cb_arg->complete_arg = arg; skb_push(skb, sizeof(struct port100_tg_comm_rf_cmd)); hdr = (struct port100_tg_comm_rf_cmd *)skb->data; memset(hdr, 0, sizeof(struct port100_tg_comm_rf_cmd)); hdr->guard_time = cpu_to_le16(500); hdr->send_timeout = cpu_to_le16(0xFFFF); hdr->recv_timeout = cpu_to_le16(timeout); return port100_send_cmd_async(dev, PORT100_CMD_TG_COMM_RF, skb, port100_tg_comm_rf_complete, cb_arg); } static int port100_listen_mdaa(struct nfc_digital_dev *ddev, struct digital_tg_mdaa_params *params, u16 timeout, nfc_digital_cmd_complete_t cb, void *arg) { struct port100 *dev = 
nfc_digital_get_drvdata(ddev); struct port100_tg_comm_rf_cmd *hdr; struct port100_cb_arg *cb_arg; struct sk_buff *skb; int rc; rc = port100_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, NFC_DIGITAL_RF_TECH_106A); if (rc) return rc; rc = port100_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, NFC_DIGITAL_FRAMING_NFCA_NFC_DEP); if (rc) return rc; cb_arg = kzalloc(sizeof(struct port100_cb_arg), GFP_KERNEL); if (!cb_arg) return -ENOMEM; cb_arg->complete_cb = cb; cb_arg->complete_arg = arg; cb_arg->mdaa = 1; skb = port100_alloc_skb(dev, 0); if (!skb) { kfree(cb_arg); return -ENOMEM; } skb_push(skb, sizeof(struct port100_tg_comm_rf_cmd)); hdr = (struct port100_tg_comm_rf_cmd *)skb->data; memset(hdr, 0, sizeof(struct port100_tg_comm_rf_cmd)); hdr->guard_time = 0; hdr->send_timeout = cpu_to_le16(0xFFFF); hdr->mdaa = 1; hdr->nfca_param[0] = (params->sens_res >> 8) & 0xFF; hdr->nfca_param[1] = params->sens_res & 0xFF; memcpy(hdr->nfca_param + 2, params->nfcid1, 3); hdr->nfca_param[5] = params->sel_res; memcpy(hdr->nfcf_param, params->nfcid2, 8); hdr->nfcf_param[16] = (params->sc >> 8) & 0xFF; hdr->nfcf_param[17] = params->sc & 0xFF; hdr->recv_timeout = cpu_to_le16(timeout); return port100_send_cmd_async(dev, PORT100_CMD_TG_COMM_RF, skb, port100_tg_comm_rf_complete, cb_arg); } static int port100_listen(struct nfc_digital_dev *ddev, u16 timeout, nfc_digital_cmd_complete_t cb, void *arg) { const struct port100 *dev = nfc_digital_get_drvdata(ddev); struct sk_buff *skb; skb = port100_alloc_skb(dev, 0); if (!skb) return -ENOMEM; return port100_tg_send_cmd(ddev, skb, timeout, cb, arg); } static const struct nfc_digital_ops port100_digital_ops = { .in_configure_hw = port100_in_configure_hw, .in_send_cmd = port100_in_send_cmd, .tg_listen_mdaa = port100_listen_mdaa, .tg_listen = port100_listen, .tg_configure_hw = port100_tg_configure_hw, .tg_send_cmd = port100_tg_send_cmd, .switch_rf = port100_switch_rf, .abort_cmd = port100_abort_cmd, }; static const struct usb_device_id port100_table[] = { { USB_DEVICE(SONY_VENDOR_ID, RCS380S_PRODUCT_ID), }, { USB_DEVICE(SONY_VENDOR_ID, RCS380P_PRODUCT_ID), }, { } }; MODULE_DEVICE_TABLE(usb, port100_table); static int port100_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct port100 *dev; int rc; struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *endpoint; int in_endpoint; int out_endpoint; u16 fw_version; u64 cmd_type_mask; int i; dev = devm_kzalloc(&interface->dev, sizeof(struct port100), GFP_KERNEL); if (!dev) return -ENOMEM; mutex_init(&dev->out_urb_lock); dev->udev = usb_get_dev(interface_to_usbdev(interface)); dev->interface = interface; usb_set_intfdata(interface, dev); in_endpoint = out_endpoint = 0; iface_desc = interface->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; if (!in_endpoint && usb_endpoint_is_bulk_in(endpoint)) in_endpoint = endpoint->bEndpointAddress; if (!out_endpoint && usb_endpoint_is_bulk_out(endpoint)) out_endpoint = endpoint->bEndpointAddress; } if (!in_endpoint || !out_endpoint) { nfc_err(&interface->dev, "Could not find bulk-in or bulk-out endpoint\n"); rc = -ENODEV; goto error; } dev->in_urb = usb_alloc_urb(0, GFP_KERNEL); dev->out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!dev->in_urb || !dev->out_urb) { nfc_err(&interface->dev, "Could not allocate USB URBs\n"); rc = -ENOMEM; goto error; } usb_fill_bulk_urb(dev->in_urb, dev->udev, usb_rcvbulkpipe(dev->udev, in_endpoint), NULL, 0, NULL, dev); usb_fill_bulk_urb(dev->out_urb, 
dev->udev, usb_sndbulkpipe(dev->udev, out_endpoint), NULL, 0, port100_send_complete, dev); dev->out_urb->transfer_flags = URB_ZERO_PACKET; dev->skb_headroom = PORT100_FRAME_HEADER_LEN + PORT100_COMM_RF_HEAD_MAX_LEN; dev->skb_tailroom = PORT100_FRAME_TAIL_LEN; init_completion(&dev->cmd_cancel_done); INIT_WORK(&dev->cmd_complete_work, port100_wq_cmd_complete); /* The first thing to do with the Port-100 is to set the command type * to be used. If supported we use command type 1. 0 otherwise. */ cmd_type_mask = port100_get_command_type_mask(dev); if (!cmd_type_mask) { nfc_err(&interface->dev, "Could not get supported command types\n"); rc = -ENODEV; goto error; } if (PORT100_CMD_TYPE_IS_SUPPORTED(cmd_type_mask, PORT100_CMD_TYPE_1)) dev->cmd_type = PORT100_CMD_TYPE_1; else dev->cmd_type = PORT100_CMD_TYPE_0; rc = port100_set_command_type(dev, dev->cmd_type); if (rc) { nfc_err(&interface->dev, "The device does not support command type %u\n", dev->cmd_type); goto error; } fw_version = port100_get_firmware_version(dev); if (!fw_version) nfc_err(&interface->dev, "Could not get device firmware version\n"); nfc_info(&interface->dev, "Sony NFC Port-100 Series attached (firmware v%x.%02x)\n", (fw_version & 0xFF00) >> 8, fw_version & 0xFF); dev->nfc_digital_dev = nfc_digital_allocate_device(&port100_digital_ops, PORT100_PROTOCOLS, PORT100_CAPABILITIES, dev->skb_headroom, dev->skb_tailroom); if (!dev->nfc_digital_dev) { nfc_err(&interface->dev, "Could not allocate nfc_digital_dev\n"); rc = -ENOMEM; goto error; } nfc_digital_set_parent_dev(dev->nfc_digital_dev, &interface->dev); nfc_digital_set_drvdata(dev->nfc_digital_dev, dev); rc = nfc_digital_register_device(dev->nfc_digital_dev); if (rc) { nfc_err(&interface->dev, "Could not register digital device\n"); goto free_nfc_dev; } return 0; free_nfc_dev: nfc_digital_free_device(dev->nfc_digital_dev); error: usb_kill_urb(dev->in_urb); usb_free_urb(dev->in_urb); usb_kill_urb(dev->out_urb); usb_free_urb(dev->out_urb); usb_put_dev(dev->udev); return rc; } static void port100_disconnect(struct usb_interface *interface) { struct port100 *dev; dev = usb_get_intfdata(interface); usb_set_intfdata(interface, NULL); nfc_digital_unregister_device(dev->nfc_digital_dev); nfc_digital_free_device(dev->nfc_digital_dev); usb_kill_urb(dev->in_urb); usb_kill_urb(dev->out_urb); usb_free_urb(dev->in_urb); usb_free_urb(dev->out_urb); usb_put_dev(dev->udev); kfree(dev->cmd); nfc_info(&interface->dev, "Sony Port-100 NFC device disconnected\n"); } static struct usb_driver port100_driver = { .name = "port100", .probe = port100_probe, .disconnect = port100_disconnect, .id_table = port100_table, }; module_usb_driver(port100_driver); MODULE_DESCRIPTION("NFC Port-100 series usb driver ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL");
// SPDX-License-Identifier: GPL-2.0-or-later
/* client.c: NFS client sharing and management code
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/xprtsock.h> #include <linux/sunrpc/xprtrdma.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> #include <linux/slab.h> #include <linux/idr.h> #include <net/ipv6.h> #include <linux/nfs_xdr.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "iostat.h" #include "internal.h" #include "fscache.h" #include "pnfs.h" #include "nfs.h" #include "netns.h" #include "sysfs.h" #include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_CLIENT static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); static DEFINE_SPINLOCK(nfs_version_lock); static DEFINE_MUTEX(nfs_version_mutex); static LIST_HEAD(nfs_versions); /* * RPC cruft for NFS */ static const struct rpc_version *nfs_version[5] = { [2] = NULL, [3] = NULL, [4] = NULL, }; const struct rpc_program nfs_program = { .name = "nfs", .number = NFS_PROGRAM, .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, .pipe_dir_name = NFS_PIPE_DIRNAME, }; static struct nfs_subversion *find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; spin_lock(&nfs_version_lock); list_for_each_entry(nfs, &nfs_versions, list) { if (nfs->rpc_ops->version == version) { spin_unlock(&nfs_version_lock); return nfs; } } spin_unlock(&nfs_version_lock); return ERR_PTR(-EPROTONOSUPPORT); } struct nfs_subversion *get_nfs_version(unsigned int version) { struct nfs_subversion *nfs = find_nfs_version(version); if (IS_ERR(nfs)) { mutex_lock(&nfs_version_mutex); request_module("nfsv%d", version); nfs = find_nfs_version(version); mutex_unlock(&nfs_version_mutex); } if (!IS_ERR(nfs) && !try_module_get(nfs->owner)) return ERR_PTR(-EAGAIN); return nfs; } void put_nfs_version(struct nfs_subversion *nfs) { module_put(nfs->owner); } void register_nfs_version(struct nfs_subversion *nfs) { spin_lock(&nfs_version_lock); list_add(&nfs->list, &nfs_versions); nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; spin_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(register_nfs_version); void unregister_nfs_version(struct nfs_subversion *nfs) { spin_lock(&nfs_version_lock); nfs_version[nfs->rpc_ops->version] = NULL; list_del(&nfs->list); spin_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(unregister_nfs_version); /* * Allocate a shared client record * * Since these are allocated/deallocated very rarely, we don't * bother putting them in a slab cache... 
*/ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp; int err = -ENOMEM; if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; if (!try_module_get(clp->cl_nfs_mod->owner)) goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; refcount_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); clp->cl_addrlen = cl_init->addrlen; if (cl_init->hostname) { err = -ENOMEM; clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); if (!clp->cl_hostname) goto error_cleanup; } INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; clp->cl_net = get_net(cl_init->net); #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); nfs_uuid_init(&clp->cl_uuid); spin_lock_init(&clp->cl_localio_lock); #endif /* CONFIG_NFS_LOCALIO */ clp->cl_principal = "*"; clp->cl_xprtsec = cl_init->xprtsec; return clp; error_cleanup: put_nfs_version(clp->cl_nfs_mod); error_dealloc: kfree(clp); error_0: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(nfs_alloc_client); #if IS_ENABLED(CONFIG_NFS_V4) static void nfs_cleanup_cb_ident_idr(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); idr_destroy(&nn->cb_ident_idr); } /* nfs_client_lock held */ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); if (clp->cl_cb_ident) idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident); } static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); } #else static void nfs_cleanup_cb_ident_idr(struct net *net) { } static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { } static void pnfs_init_server(struct nfs_server *server) { } #endif /* CONFIG_NFS_V4 */ /* * Destroy a shared client record */ void nfs_free_client(struct nfs_client *clp) { nfs_local_disable(clp); /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); put_net(clp->cl_net); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); kfree_rcu(clp, rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); /* * Release a reference to a shared client record */ void nfs_put_client(struct nfs_client *clp) { struct nfs_net *nn; if (!clp) return; nn = net_generic(clp->cl_net, nfs_net_id); if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); WARN_ON_ONCE(!list_empty(&clp->cl_superblocks)); clp->rpc_ops->free_client(clp); } } EXPORT_SYMBOL_GPL(nfs_put_client); /* * Find an nfs_client on the list that matches the initialisation data * that is supplied. 
*/ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; const struct sockaddr *sap = (struct sockaddr *)data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); int error; again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; /* If a client is still initializing then we need to wait */ if (clp->cl_cons_state > NFS_CS_READY) { refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); error = nfs_wait_client_init_complete(clp); nfs_put_client(clp); spin_lock(&nn->nfs_client_lock); if (error < 0) return ERR_PTR(error); goto again; } /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; if (clp->cl_proto != data->proto) continue; /* Match nfsv4 minorversion */ if (clp->cl_minorversion != data->minorversion) continue; /* Match request for a dedicated DS */ if (test_bit(NFS_CS_DS, &data->init_flags) != test_bit(NFS_CS_DS, &clp->cl_flags)) continue; /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) /* Match all xprt_switch full socket addresses */ if (IS_ERR(clp->cl_rpcclient) || !rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient, sap)) continue; /* Match the xprt security policy */ if (clp->cl_xprtsec.policy != data->xprtsec.policy) continue; refcount_inc(&clp->cl_count); return clp; } return NULL; } /* * Return true if @clp is done initializing, false if still working on it. * * Use nfs_client_init_status to check if it was successful. */ bool nfs_client_init_is_complete(const struct nfs_client *clp) { return clp->cl_cons_state <= NFS_CS_READY; } EXPORT_SYMBOL_GPL(nfs_client_init_is_complete); /* * Return 0 if @clp was successfully initialized, -errno otherwise. * * This must be called *after* nfs_client_init_is_complete() returns true, * otherwise it will pop WARN_ON_ONCE and return -EINVAL */ int nfs_client_init_status(const struct nfs_client *clp) { /* called without checking nfs_client_init_is_complete */ if (clp->cl_cons_state > NFS_CS_READY) { WARN_ON_ONCE(1); return -EINVAL; } return clp->cl_cons_state; } EXPORT_SYMBOL_GPL(nfs_client_init_status); int nfs_wait_client_init_complete(const struct nfs_client *clp) { return wait_event_killable(nfs_client_active_wq, nfs_client_init_is_complete(clp)); } EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete); /* * Found an existing client. Make sure it's ready before returning. 
*/ static struct nfs_client * nfs_found_client(const struct nfs_client_initdata *cl_init, struct nfs_client *clp) { int error; error = nfs_wait_client_init_complete(clp); if (error < 0) { nfs_put_client(clp); return ERR_PTR(-ERESTARTSYS); } if (clp->cl_cons_state < NFS_CS_READY) { error = clp->cl_cons_state; nfs_put_client(clp); return ERR_PTR(error); } smp_rmb(); return clp; } /* * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp, *new = NULL; struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops; if (cl_init->hostname == NULL) { WARN_ON(1); return ERR_PTR(-EINVAL); } /* see if the client already exists */ do { spin_lock(&nn->nfs_client_lock); clp = nfs_match_client(cl_init); if (clp) { spin_unlock(&nn->nfs_client_lock); if (new) new->rpc_ops->free_client(new); if (IS_ERR(clp)) return clp; return nfs_found_client(cl_init, clp); } if (new) { list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new = rpc_ops->init_client(new, cl_init); if (!IS_ERR(new)) nfs_local_probe(new); return new; } spin_unlock(&nn->nfs_client_lock); new = rpc_ops->alloc_client(cl_init); } while (!IS_ERR(new)); return new; } EXPORT_SYMBOL_GPL(nfs_get_client); /* * Mark a server as ready or failed */ void nfs_mark_client_ready(struct nfs_client *clp, int state) { smp_wmb(); clp->cl_cons_state = state; wake_up_all(&nfs_client_active_wq); } EXPORT_SYMBOL_GPL(nfs_mark_client_ready); /* * Initialise the timeout values for a connection */ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans) { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; to->to_increment = to->to_initval; to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); if (to->to_maxval > NFS_MAX_TCP_TIMEOUT) to->to_maxval = NFS_MAX_TCP_TIMEOUT; if (to->to_maxval < to->to_initval) to->to_maxval = to->to_initval; to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_UDP_RETRANS; if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; default: BUG(); } } EXPORT_SYMBOL_GPL(nfs_init_timeout_values); /* * Create an RPC client handle */ int nfs_create_rpc_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { .net = clp->cl_net, .protocol = clp->cl_proto, .nconnect = clp->cl_nconnect, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, .timeout = cl_init->timeparms, .servername = clp->cl_hostname, .nodename = cl_init->nodename, .program = &nfs_program, .stats = &nn->rpcstats, .version = clp->rpc_ops->version, .authflavor = flavor, .cred = cl_init->cred, .xprtsec = 
cl_init->xprtsec, .connect_timeout = cl_init->connect_timeout, .reconnect_timeout = cl_init->reconnect_timeout, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_DISCRTRY; if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT; if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; if (test_bit(NFS_CS_NOPING, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NOPING; if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_REUSEPORT; if (!IS_ERR(clp->cl_rpcclient)) return 0; clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("%s: cannot create RPC client. Error = %ld\n", __func__, PTR_ERR(clnt)); return PTR_ERR(clnt); } clnt->cl_principal = clp->cl_principal; clp->cl_rpcclient = clnt; clnt->cl_max_connect = clp->cl_max_connect; return 0; } EXPORT_SYMBOL_GPL(nfs_create_rpc_client); /* * Version 2 or 3 client destruction */ static void nfs_destroy_server(struct nfs_server *server) { if (server->nlm_host) nlmclnt_done(server->nlm_host); } /* * Version 2 or 3 lockd setup */ static int nfs_start_lockd(struct nfs_server *server) { struct nlm_host *host; struct nfs_client *clp = server->nfs_client; struct nlmclnt_initdata nlm_init = { .hostname = clp->cl_hostname, .address = (struct sockaddr *)&clp->cl_addr, .addrlen = clp->cl_addrlen, .nfs_version = clp->rpc_ops->version, .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, .net = clp->cl_net, .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, .cred = server->cred, }; if (nlm_init.nfs_version > 3) return 0; if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && (server->flags & NFS_MOUNT_LOCAL_FCNTL)) return 0; switch (clp->cl_proto) { default: nlm_init.protocol = IPPROTO_TCP; break; #ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT case XPRT_TRANSPORT_UDP: nlm_init.protocol = IPPROTO_UDP; #endif } host = nlmclnt_init(&nlm_init); if (IS_ERR(host)) return PTR_ERR(host); server->nlm_host = host; server->destroy = nfs_destroy_server; nfs_sysfs_link_rpc_client(server, nlmclnt_rpc_clnt(host), NULL); return 0; } /* * Create a general RPC client */ int nfs_init_server_rpcclient(struct nfs_server *server, const struct rpc_timeout *timeo, rpc_authflavor_t pseudoflavour) { struct nfs_client *clp = server->nfs_client; server->client = rpc_clone_client_set_auth(clp->cl_rpcclient, pseudoflavour); if (IS_ERR(server->client)) { dprintk("%s: couldn't create rpc_client!\n", __func__); return PTR_ERR(server->client); } memcpy(&server->client->cl_timeout_default, timeo, sizeof(server->client->cl_timeout_default)); server->client->cl_timeout = &server->client->cl_timeout_default; server->client->cl_softrtry = 0; if (server->flags & NFS_MOUNT_SOFTERR) server->client->cl_softerr = 1; if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; nfs_sysfs_link_rpc_client(server, server->client, NULL); return 0; } EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); /** * nfs_init_client - Initialise an NFS2 or NFS3 client * * @clp: nfs_client to initialise * @cl_init: Initialisation parameters * * Returns pointer to an NFS client, or an ERR_PTR value. 
*/ struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init) { int error; /* the client is already initialised */ if (clp->cl_cons_state == NFS_CS_READY) return clp; /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error); if (error < 0) { nfs_put_client(clp); clp = ERR_PTR(error); } return clp; } EXPORT_SYMBOL_GPL(nfs_init_client); /* * Create a version 2 or 3 client */ static int nfs_init_server(struct nfs_server *server, const struct fs_context *fc) { const struct nfs_fs_context *ctx = nfs_fc2context(fc); struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = ctx->nfs_server.hostname, .addr = &ctx->nfs_server._address, .addrlen = ctx->nfs_server.addrlen, .nfs_mod = ctx->nfs_mod, .proto = ctx->nfs_server.protocol, .net = fc->net_ns, .timeparms = &timeparms, .cred = server->cred, .nconnect = ctx->nfs_server.nconnect, .init_flags = (1UL << NFS_CS_REUSEPORT), .xprtsec = ctx->xprtsec, }; struct nfs_client *clp; int error; nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol, ctx->timeo, ctx->retrans); if (ctx->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) return PTR_ERR(clp); server->nfs_client = clp; nfs_sysfs_add_server(server); nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state"); /* Initialise the client representation from the mount data */ server->flags = ctx->flags; server->options = ctx->options; server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; switch (clp->rpc_ops->version) { case 2: server->fattr_valid = NFS_ATTR_FATTR_V2; break; case 3: server->fattr_valid = NFS_ATTR_FATTR_V3; break; default: server->fattr_valid = NFS_ATTR_FATTR_V4; } if (ctx->rsize) server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto); if (ctx->wsize) server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto); server->acregmin = ctx->acregmin * HZ; server->acregmax = ctx->acregmax * HZ; server->acdirmin = ctx->acdirmin * HZ; server->acdirmax = ctx->acdirmax * HZ; /* Start lockd here, before we might error out */ error = nfs_start_lockd(server); if (error < 0) goto error; server->port = ctx->nfs_server.port; server->auth_info = ctx->auth_info; error = nfs_init_server_rpcclient(server, &timeparms, ctx->selected_flavor); if (error < 0) goto error; /* Preserve the values of mount_server-related mount options */ if (ctx->mount_server.addrlen) { memcpy(&server->mountd_address, &ctx->mount_server.address, ctx->mount_server.addrlen); server->mountd_addrlen = ctx->mount_server.addrlen; } server->mountd_version = ctx->mount_server.version; server->mountd_port = ctx->mount_server.port; server->mountd_protocol = ctx->mount_server.protocol; server->namelen = ctx->namlen; return 0; error: server->nfs_client = NULL; nfs_put_client(clp); return error; } /* * Load up the server record from information gained in an fsinfo record */ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) { struct nfs_client *clp = server->nfs_client; unsigned long max_rpc_payload, raw_max_rpc_payload; /* Work out a lot of parameters */ if (server->rsize == 0) server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto); if (server->wsize == 0) server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto); if (fsinfo->rtmax >= 512 && 
server->rsize > fsinfo->rtmax) server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto); if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto); raw_max_rpc_payload = rpc_max_payload(server->client); max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL); if (server->rsize > max_rpc_payload) server->rsize = max_rpc_payload; if (server->rsize > NFS_MAX_FILE_IO_SIZE) server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); if (server->dtsize > NFS_MAX_FILE_IO_SIZE) server->dtsize = NFS_MAX_FILE_IO_SIZE; if (server->dtsize > server->rsize) server->dtsize = server->rsize; if (server->flags & NFS_MOUNT_NOAC) { server->acregmin = server->acregmax = 0; server->acdirmin = server->acdirmax = 0; } server->maxfilesize = fsinfo->maxfilesize; server->time_delta = fsinfo->time_delta; server->change_attr_type = fsinfo->change_attr_type; server->clone_blksize = fsinfo->clone_blksize; /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); #ifdef CONFIG_NFS_V4_2 /* * Defaults until limited by the session parameters. */ server->gxasize = min_t(unsigned int, raw_max_rpc_payload, XATTR_SIZE_MAX); server->sxasize = min_t(unsigned int, raw_max_rpc_payload, XATTR_SIZE_MAX); server->lxasize = min_t(unsigned int, raw_max_rpc_payload, nfs42_listxattr_xdrsize(XATTR_LIST_MAX)); if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; #endif } /* * Probe filesystem information, including the FSID on v2/v3 */ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) { struct nfs_fsinfo fsinfo; struct nfs_client *clp = server->nfs_client; int error; if (clp->rpc_ops->set_capabilities != NULL) { error = clp->rpc_ops->set_capabilities(server, mntfh); if (error < 0) return error; } fsinfo.fattr = fattr; fsinfo.nlayouttypes = 0; memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) return error; nfs_server_set_fsinfo(server, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { struct nfs_pathconf pathinfo; pathinfo.fattr = fattr; nfs_fattr_init(fattr); if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) server->namelen = pathinfo.max_namelen; } if (clp->rpc_ops->discover_trunking != NULL && (server->caps & NFS_CAP_FS_LOCATIONS && (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) { error = clp->rpc_ops->discover_trunking(server, mntfh); if (error < 0) return error; } return 0; } /* * Grab the destination's particulars, including lease expiry time. * * Returns zero if probe succeeded and retrieved FSID matches the FSID * we have cached. */ int nfs_probe_server(struct nfs_server *server, struct nfs_fh *mntfh) { struct nfs_fattr *fattr; int error; fattr = nfs_alloc_fattr(); if (fattr == NULL) return -ENOMEM; /* Sanity: the probe won't work if the destination server * does not recognize the migrated FH. 
*/ error = nfs_probe_fsinfo(server, mntfh, fattr); nfs_free_fattr(fattr); return error; } EXPORT_SYMBOL_GPL(nfs_probe_server); /* * Copy useful information when duplicating a server record */ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) { target->flags = source->flags; target->rsize = source->rsize; target->wsize = source->wsize; target->acregmin = source->acregmin; target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; target->acdirmax = source->acdirmax; target->caps = source->caps; target->options = source->options; target->auth_info = source->auth_info; target->port = source->port; } EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); void nfs_server_insert_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); list_add_tail(&server->master_link, &nn->nfs_volume_list); clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); spin_unlock(&nn->nfs_client_lock); } EXPORT_SYMBOL_GPL(nfs_server_insert_lists); void nfs_server_remove_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn; if (clp == NULL) return; nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_del_rcu(&server->client_link); if (list_empty(&clp->cl_superblocks)) set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); list_del(&server->master_link); spin_unlock(&nn->nfs_client_lock); synchronize_rcu(); } EXPORT_SYMBOL_GPL(nfs_server_remove_lists); static DEFINE_IDA(s_sysfs_ids); /* * Allocate and initialise a server record */ struct nfs_server *nfs_alloc_server(void) { struct nfs_server *server; server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) return NULL; server->s_sysfs_id = ida_alloc(&s_sysfs_ids, GFP_KERNEL); if (server->s_sysfs_id < 0) { kfree(server); return NULL; } server->client = server->client_acl = ERR_PTR(-EINVAL); /* Zero out the NFS state stuff */ INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); INIT_LIST_HEAD(&server->layouts); INIT_LIST_HEAD(&server->state_owners_lru); INIT_LIST_HEAD(&server->ss_copies); INIT_LIST_HEAD(&server->ss_src_copies); atomic_set(&server->active, 0); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { kfree(server); return NULL; } server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; init_waitqueue_head(&server->write_congestion_wait); atomic_long_set(&server->writeback, 0); atomic64_set(&server->owner_ctr, 0); pnfs_init_server(server); rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); return server; } EXPORT_SYMBOL_GPL(nfs_alloc_server); static void delayed_free(struct rcu_head *p) { struct nfs_server *server = container_of(p, struct nfs_server, rcu); nfs_free_iostats(server->io_stats); kfree(server); } /* * Free up a server record */ void nfs_free_server(struct nfs_server *server) { nfs_server_remove_lists(server); if (server->destroy != NULL) server->destroy(server); if (!IS_ERR(server->client_acl)) rpc_shutdown_client(server->client_acl); if (!IS_ERR(server->client)) rpc_shutdown_client(server->client); nfs_put_client(server->nfs_client); if (server->kobj.state_initialized) { nfs_sysfs_remove_server(server); kobject_put(&server->kobj); } ida_free(&s_sysfs_ids, server->s_sysfs_id); put_cred(server->cred); nfs_release_automount_timer(); call_rcu(&server->rcu, delayed_free); } 
EXPORT_SYMBOL_GPL(nfs_free_server); /* * Create a version 2 or 3 volume record * - keyed on server and FSID */ struct nfs_server *nfs_create_server(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_server *server; struct nfs_fattr *fattr; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); server->cred = get_cred(fc->cred); error = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) goto error; /* Get a client representation */ error = nfs_init_server(server, fc); if (error < 0) goto error; /* Probe the root fh to retrieve its FSID */ error = nfs_probe_fsinfo(server, ctx->mntfh, fattr); if (error < 0) goto error; if (server->nfs_client->rpc_ops->version == 3) { if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) server->namelen = NFS3_MAXNAMLEN; if (!(ctx->flags & NFS_MOUNT_NORDIRPLUS)) server->caps |= NFS_CAP_READDIRPLUS; } else { if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, fattr, NULL); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; } } memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); nfs_server_insert_lists(server); server->mount_time = jiffies; nfs_free_fattr(fattr); return server; error: nfs_free_fattr(fattr); nfs_free_server(server); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_create_server); /* * Clone an NFS2, NFS3 or NFS4 server record */ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fh *fh, struct nfs_fattr *fattr, rpc_authflavor_t flavor) { struct nfs_server *server; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); server->cred = get_cred(source->cred); /* Copy data from the source */ server->nfs_client = source->nfs_client; server->destroy = source->destroy; refcount_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); server->fsid = fattr->fsid; nfs_sysfs_add_server(server); nfs_sysfs_link_rpc_client(server, server->nfs_client->cl_rpcclient, "_state"); error = nfs_init_server_rpcclient(server, source->client->cl_timeout, flavor); if (error < 0) goto out_free_server; /* probe the filesystem info for this server filesystem */ error = nfs_probe_server(server, fh); if (error < 0) goto out_free_server; if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; error = nfs_start_lockd(server); if (error < 0) goto out_free_server; nfs_server_insert_lists(server); server->mount_time = jiffies; return server; out_free_server: nfs_free_server(server); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_clone_server); void nfs_clients_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); INIT_LIST_HEAD(&nn->nfs_client_list); INIT_LIST_HEAD(&nn->nfs_volume_list); #if IS_ENABLED(CONFIG_NFS_V4) idr_init(&nn->cb_ident_idr); #endif spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); memset(&nn->rpcstats, 0, sizeof(nn->rpcstats)); nn->rpcstats.program = &nfs_program; nfs_netns_sysfs_setup(nn, net); } void nfs_clients_exit(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); nfs_netns_sysfs_destroy(nn); nfs_cleanup_cb_ident_idr(net); WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); } 
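/*
 * The allocators above (nfs_create_server() and nfs_clone_server()) follow
 * the kernel's error-pointer convention: on failure they return
 * ERR_PTR(-errno) instead of NULL, and callers test the result with
 * IS_ERR()/PTR_ERR(). The snippet below is a minimal, self-contained
 * userspace sketch of that convention; the ERR_PTR/IS_ERR/PTR_ERR helpers
 * are simplified stand-ins for include/linux/err.h and the demo types are
 * made up purely for illustration.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO	4095

static void *ERR_PTR(long error)
{
	return (void *)error;
}

static long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static int IS_ERR(const void *ptr)
{
	/* error pointers occupy the last page of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct demo_server {
	int id;
};

/* Allocate a record, or return an error pointer describing the failure. */
static struct demo_server *demo_create_server(int simulate_failure)
{
	struct demo_server *s;

	if (simulate_failure)
		return ERR_PTR(-ENOMEM);
	s = malloc(sizeof(*s));
	if (!s)
		return ERR_PTR(-ENOMEM);
	s->id = 1;
	return s;
}

int main(void)
{
	struct demo_server *s = demo_create_server(1);

	if (IS_ERR(s))
		printf("create failed: %ld\n", PTR_ERR(s));	/* prints -12 (ENOMEM) */
	else
		free(s);
	return 0;
}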
#ifdef CONFIG_PROC_FS static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); static int nfs_server_list_show(struct seq_file *m, void *v); static const struct seq_operations nfs_server_list_ops = { .start = nfs_server_list_start, .next = nfs_server_list_next, .stop = nfs_server_list_stop, .show = nfs_server_list_show, }; static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); static int nfs_volume_list_show(struct seq_file *m, void *v); static const struct seq_operations nfs_volume_list_ops = { .start = nfs_volume_list_start, .next = nfs_volume_list_next, .stop = nfs_volume_list_stop, .show = nfs_volume_list_show, }; /* * set up the iterator to start reading from the server list and return the first item */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); return seq_list_start_head(&nn->nfs_client_list, *_pos); } /* * move to next server */ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_client_list, pos); } /* * clean up after reading from the transports list */ static void nfs_server_list_stop(struct seq_file *p, void *v) __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } /* * display a header line followed by a load of call lines */ static int nfs_server_list_show(struct seq_file *m, void *v) { struct nfs_client *clp; struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_client_list) { seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); return 0; } /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); /* Check if the client is initialized */ if (clp->cl_cons_state != NFS_CS_READY) return 0; rcu_read_lock(); seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), refcount_read(&clp->cl_count), clp->cl_hostname); rcu_read_unlock(); return 0; } /* * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); return seq_list_start_head(&nn->nfs_volume_list, *_pos); } /* * move to next volume */ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_volume_list, pos); } /* * clean up after reading from the transports list */ static void nfs_volume_list_stop(struct seq_file *p, void *v) __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } /* * display a header line followed by a load of call lines */ static int 
nfs_volume_list_show(struct seq_file *m, void *v) { struct nfs_server *server; struct nfs_client *clp; char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0' char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0' struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { seq_puts(m, "NV SERVER PORT DEV FSID" " FSC\n"); return 0; } /* display one transport per line on subsequent lines */ server = list_entry(v, struct nfs_server, master_link); clp = server->nfs_client; snprintf(dev, sizeof(dev), "%u:%u", MAJOR(server->s_dev), MINOR(server->s_dev)); snprintf(fsid, sizeof(fsid), "%llx:%llx", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); rcu_read_lock(); seq_printf(m, "v%u %s %s %-12s %-33s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), dev, fsid, nfs_server_fscache_state(server)); rcu_read_unlock(); return 0; } int nfs_fs_proc_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); struct proc_dir_entry *p; nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net); if (!nn->proc_nfsfs) goto error_0; /* a file of servers with which we're dealing */ p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_server_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; /* a file of volumes that we have mounted */ p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_volume_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; return 0; error_1: remove_proc_subtree("nfsfs", net->proc_net); error_0: return -ENOMEM; } void nfs_fs_proc_net_exit(struct net *net) { remove_proc_subtree("nfsfs", net->proc_net); } /* * initialise the /proc/fs/nfsfs/ directory */ int __init nfs_fs_proc_init(void) { if (!proc_mkdir("fs/nfsfs", NULL)) goto error_0; /* a file of servers with which we're dealing */ if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers")) goto error_1; /* a file of volumes that we have mounted */ if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes")) goto error_1; return 0; error_1: remove_proc_subtree("fs/nfsfs", NULL); error_0: return -ENOMEM; } /* * clean up the /proc/fs/nfsfs/ directory */ void nfs_fs_proc_exit(void) { remove_proc_subtree("fs/nfsfs", NULL); ida_destroy(&s_sysfs_ids); } #endif /* CONFIG_PROC_FS */
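/*
 * How the /proc interfaces defined above look from userspace: with
 * CONFIG_PROC_FS enabled, nfs_fs_proc_init() and nfs_fs_proc_net_init()
 * expose per-network-namespace "servers" and "volumes" files under
 * /proc/fs/nfsfs/. The sketch below is a minimal illustrative reader that
 * simply dumps the servers file; each data line follows the seq_printf()
 * format in nfs_server_list_show(), i.e.
 * "v<version> <hex address> <hex port> <use count> <hostname>" under the
 * "NV SERVER PORT USE HOSTNAME" header.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/fs/nfsfs/servers", "r");
	char line[256];

	if (!f) {
		perror("fopen /proc/fs/nfsfs/servers");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}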
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> */ #include <linux/hashtable.h> #include <linux/xattr.h> #include "messages.h" #include "props.h" #include "btrfs_inode.h" #include "transaction.h" #include "ctree.h" #include "xattr.h" #include "compression.h" #include "space-info.h" #include "fs.h" #include "accessors.h" #include "super.h" #include "dir-item.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); struct prop_handler { struct hlist_node node; const char *xattr_name; int (*validate)(const struct btrfs_inode *inode, const char *value, size_t len); int (*apply)(struct inode *inode, const char *value, size_t len); const char *(*extract)(const struct inode *inode); bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash) { struct hlist_head *h; h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)]; if (hlist_empty(h)) return NULL; return h; } static const struct prop_handler * find_prop_handler(const char *name, const struct hlist_head *handlers) { struct prop_handler *h; if (!handlers) { u64 hash = btrfs_name_hash(name, strlen(name)); handlers = find_prop_handlers_by_hash(hash); if (!handlers) return NULL; } hlist_for_each_entry(h, handlers, node) if (!strcmp(h->xattr_name, name)) return h; return NULL; } int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, const char *value, size_t value_len) { const struct prop_handler *handler; if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN) return -EINVAL; handler = find_prop_handler(name, NULL); if (!handler) return -EINVAL; if (value_len
== 0) return 0; return handler->validate(inode, value, value_len); } /* * Check if a property should be ignored (not set) for an inode. * * @inode: The target inode. * @name: The property's name. * * The caller must be sure the given property name is valid, for example by * having previously called btrfs_validate_prop(). * * Returns: true if the property should be ignored for the given inode * false if the property must not be ignored for the given inode */ bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name) { const struct prop_handler *handler; handler = find_prop_handler(name, NULL); ASSERT(handler != NULL); return handler->ignore(inode); } int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, const char *name, const char *value, size_t value_len, int flags) { const struct prop_handler *handler; int ret; handler = find_prop_handler(name, NULL); if (!handler) return -EINVAL; if (value_len == 0) { ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); if (ret) return ret; ret = handler->apply(&inode->vfs_inode, NULL, 0); ASSERT(ret == 0); return ret; } ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, value, value_len, flags); if (ret) return ret; ret = handler->apply(&inode->vfs_inode, value, value_len); if (ret) { btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); return ret; } set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags); return 0; } static int iterate_object_props(struct btrfs_root *root, struct btrfs_path *path, u64 objectid, void (*iterator)(void *, const struct prop_handler *, const char *, size_t), void *ctx) { int ret; char *name_buf = NULL; char *value_buf = NULL; int name_buf_len = 0; int value_buf_len = 0; while (1) { struct btrfs_key key; struct btrfs_dir_item *di; struct extent_buffer *leaf; u32 total_len, cur, this_len; int slot; const struct hlist_head *handlers; slot = path->slots[0]; leaf = path->nodes[0]; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; else if (ret > 0) break; continue; } btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != objectid) break; if (key.type != BTRFS_XATTR_ITEM_KEY) break; handlers = find_prop_handlers_by_hash(key.offset); if (!handlers) goto next_slot; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); cur = 0; total_len = btrfs_item_size(leaf, slot); while (cur < total_len) { u32 name_len = btrfs_dir_name_len(leaf, di); u32 data_len = btrfs_dir_data_len(leaf, di); unsigned long name_ptr, data_ptr; const struct prop_handler *handler; this_len = sizeof(*di) + name_len + data_len; name_ptr = (unsigned long)(di + 1); data_ptr = name_ptr + name_len; if (name_len <= XATTR_BTRFS_PREFIX_LEN || memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, name_ptr, XATTR_BTRFS_PREFIX_LEN)) goto next_dir_item; if (name_len >= name_buf_len) { kfree(name_buf); name_buf_len = name_len + 1; name_buf = kmalloc(name_buf_len, GFP_NOFS); if (!name_buf) { ret = -ENOMEM; goto out; } } read_extent_buffer(leaf, name_buf, name_ptr, name_len); name_buf[name_len] = '\0'; handler = find_prop_handler(name_buf, handlers); if (!handler) goto next_dir_item; if (data_len > value_buf_len) { kfree(value_buf); value_buf_len = data_len; value_buf = kmalloc(data_len, GFP_NOFS); if (!value_buf) { ret = -ENOMEM; goto out; } } read_extent_buffer(leaf, value_buf, data_ptr, data_len); iterator(ctx, handler, value_buf, data_len); next_dir_item: cur += this_len; di = (struct btrfs_dir_item *)((char *) 
di + this_len); } next_slot: path->slots[0]++; } ret = 0; out: btrfs_release_path(path); kfree(name_buf); kfree(value_buf); return ret; } static void inode_prop_iterator(void *ctx, const struct prop_handler *handler, const char *value, size_t len) { struct inode *inode = ctx; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; ret = handler->apply(inode, value, len); if (unlikely(ret)) btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", handler->xattr_name, btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); else set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 ino = btrfs_ino(BTRFS_I(inode)); return iterate_object_props(root, path, ino, inode_prop_iterator, inode); } static int prop_compression_validate(const struct btrfs_inode *inode, const char *value, size_t len) { if (!btrfs_inode_can_compress(inode)) return -EINVAL; if (!value) return 0; if (btrfs_compress_is_valid_type(value, len)) return 0; if ((len == 2 && strncmp("no", value, 2) == 0) || (len == 4 && strncmp("none", value, 4) == 0)) return 0; return -EINVAL; } static int prop_compression_apply(struct inode *inode, const char *value, size_t len) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int type; /* Reset to defaults */ if (len == 0) { BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; return 0; } /* Set NOCOMPRESS flag */ if ((len == 2 && strncmp("no", value, 2) == 0) || (len == 4 && strncmp("none", value, 4) == 0)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; return 0; } if (!strncmp("lzo", value, 3)) { type = BTRFS_COMPRESS_LZO; btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (!strncmp("zlib", value, 4)) { type = BTRFS_COMPRESS_ZLIB; } else if (!strncmp("zstd", value, 4)) { type = BTRFS_COMPRESS_ZSTD; btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } else { return -EINVAL; } BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; BTRFS_I(inode)->prop_compress = type; return 0; } static bool prop_compression_ignore(const struct btrfs_inode *inode) { /* * Compression only has effect for regular files, and for directories * we set it just to propagate it to new files created inside them. * Everything else (symlinks, devices, sockets, fifos) is pointless as * it will do nothing, so don't waste metadata space on a compression * xattr for anything that is neither a file nor a directory. 
*/ if (!S_ISREG(inode->vfs_inode.i_mode) && !S_ISDIR(inode->vfs_inode.i_mode)) return true; return false; } static const char *prop_compression_extract(const struct inode *inode) { switch (BTRFS_I(inode)->prop_compress) { case BTRFS_COMPRESS_ZLIB: case BTRFS_COMPRESS_LZO: case BTRFS_COMPRESS_ZSTD: return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); default: break; } return NULL; } static struct prop_handler prop_handlers[] = { { .xattr_name = XATTR_BTRFS_PREFIX "compression", .validate = prop_compression_validate, .apply = prop_compression_apply, .extract = prop_compression_extract, .ignore = prop_compression_ignore, .inheritable = 1 }, }; int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, const struct inode *parent) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; int i; bool need_reserve = false; if (!test_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(parent)->runtime_flags)) return 0; for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { const struct prop_handler *h = &prop_handlers[i]; const char *value; u64 num_bytes = 0; if (!h->inheritable) continue; if (h->ignore(BTRFS_I(inode))) continue; value = h->extract(parent); if (!value) continue; /* * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. */ ret = h->validate(BTRFS_I(inode), value, strlen(value)); if (ret) continue; /* * Currently callers should be reserving 1 item for properties, * since we only have 1 property that we currently support. If * we add more in the future we need to try and reserve more * space for them. But we should also revisit how we do space * reservations if we do add more properties in the future. */ if (need_reserve) { num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); ret = btrfs_block_rsv_add(fs_info, trans->block_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); if (ret) return ret; } ret = btrfs_setxattr(trans, inode, h->xattr_name, value, strlen(value), 0); if (!ret) { ret = h->apply(inode, value, strlen(value)); if (ret) btrfs_setxattr(trans, inode, h->xattr_name, NULL, 0, 0); else set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } if (need_reserve) { btrfs_block_rsv_release(fs_info, trans->block_rsv, num_bytes, NULL); if (ret) return ret; } need_reserve = true; } return 0; } int __init btrfs_props_init(void) { int i; for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { struct prop_handler *p = &prop_handlers[i]; u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); hash_add(prop_handlers_ht, &p->node, h); } return 0; }
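/*
 * Userspace view of the property machinery above: the only handler in
 * prop_handlers[] is keyed on the "btrfs.compression" xattr
 * (XATTR_BTRFS_PREFIX "compression"), so setting that xattr on a file or
 * directory of a mounted btrfs filesystem is routed through
 * btrfs_validate_prop() and btrfs_set_prop(), ending up in
 * prop_compression_validate()/prop_compression_apply(). A minimal sketch,
 * assuming "/mnt/btrfs/file" is an example path on a btrfs mount; accepted
 * values include "zlib", "lzo", "zstd" and "no"/"none", and an empty value
 * clears the property again (see btrfs_set_prop()).
 */
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/btrfs/file";	/* assumed example path */
	const char *value = "zstd";

	if (setxattr(path, "btrfs.compression", value, strlen(value), 0) != 0) {
		perror("setxattr btrfs.compression");
		return 1;
	}
	return 0;
}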
// SPDX-License-Identifier: GPL-2.0-or-later /* */ #include <linux/gfp.h> #include <linux/init.h> #include <linux/ratelimit.h> #include <linux/usb.h> #include <linux/usb/audio.h> #include <linux/slab.h> #include <sound/core.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include "usbaudio.h" #include "helper.h" #include "card.h" #include "endpoint.h" #include "pcm.h" #include "clock.h" #include "quirks.h" enum { EP_STATE_STOPPED, EP_STATE_RUNNING, EP_STATE_STOPPING, }; /* interface refcounting */ struct snd_usb_iface_ref { unsigned char iface; bool need_setup; int opened; int altset; struct list_head list; }; /* clock refcounting */ struct snd_usb_clock_ref { unsigned char clock; atomic_t locked; int opened; int rate; bool need_setup; struct list_head list; }; /* * snd_usb_endpoint is a model that abstracts everything related to an * USB endpoint and its streaming. * * There are functions to activate and deactivate the streaming URBs and * optional callbacks to let the pcm logic handle the actual content of the * packets for playback and record. Thus, the bus streaming and the audio * handlers are fully decoupled. * * There are two different types of endpoints in audio applications. * * SND_USB_ENDPOINT_TYPE_DATA handles full audio data payload for both * inbound and outbound traffic. * * SND_USB_ENDPOINT_TYPE_SYNC endpoints are for inbound traffic only and * expect the payload to carry Q10.14 / Q16.16 formatted sync information * (3 or 4 bytes). * * Each endpoint has to be configured prior to being used by calling * snd_usb_endpoint_set_params(). * * The model incorporates a reference counting, so that multiple users * can call snd_usb_endpoint_start() and snd_usb_endpoint_stop(), and * only the first user will effectively start the URBs, and only the last * one to stop it will tear the URBs down again.
*/ /* * convert a sampling rate into our full speed format (fs/1000 in Q16.16) * this will overflow at approx 524 kHz */ static inline unsigned get_usb_full_speed_rate(unsigned int rate) { return ((rate << 13) + 62) / 125; } /* * convert a sampling rate into USB high speed format (fs/8000 in Q16.16) * this will overflow at approx 4 MHz */ static inline unsigned get_usb_high_speed_rate(unsigned int rate) { return ((rate << 10) + 62) / 125; } /* * release a urb data */ static void release_urb_ctx(struct snd_urb_ctx *u) { if (u->urb && u->buffer_size) usb_free_coherent(u->ep->chip->dev, u->buffer_size, u->urb->transfer_buffer, u->urb->transfer_dma); usb_free_urb(u->urb); u->urb = NULL; u->buffer_size = 0; } static const char *usb_error_string(int err) { switch (err) { case -ENODEV: return "no device"; case -ENOENT: return "endpoint not enabled"; case -EPIPE: return "endpoint stalled"; case -ENOSPC: return "not enough bandwidth"; case -ESHUTDOWN: return "device disabled"; case -EHOSTUNREACH: return "device suspended"; case -EINVAL: case -EAGAIN: case -EFBIG: case -EMSGSIZE: return "internal error"; default: return "unknown error"; } } static inline bool ep_state_running(struct snd_usb_endpoint *ep) { return atomic_read(&ep->state) == EP_STATE_RUNNING; } static inline bool ep_state_update(struct snd_usb_endpoint *ep, int old, int new) { return atomic_try_cmpxchg(&ep->state, &old, new); } /** * snd_usb_endpoint_implicit_feedback_sink: Report endpoint usage type * * @ep: The snd_usb_endpoint * * Determine whether an endpoint is driven by an implicit feedback * data endpoint source. */ int snd_usb_endpoint_implicit_feedback_sink(struct snd_usb_endpoint *ep) { return ep->implicit_fb_sync && usb_pipeout(ep->pipe); } /* * Return the number of samples to be sent in the next packet * for streaming based on information derived from sync endpoints * * This won't be used for implicit feedback which takes the packet size * returned from the sync source */ static int slave_next_packet_size(struct snd_usb_endpoint *ep, unsigned int avail) { unsigned long flags; unsigned int phase; int ret; if (ep->fill_max) return ep->maxframesize; spin_lock_irqsave(&ep->lock, flags); phase = (ep->phase & 0xffff) + (ep->freqm << ep->datainterval); ret = min(phase >> 16, ep->maxframesize); if (avail && ret >= avail) ret = -EAGAIN; else ep->phase = phase; spin_unlock_irqrestore(&ep->lock, flags); return ret; } /* * Return the number of samples to be sent in the next packet * for adaptive and synchronous endpoints */ static int next_packet_size(struct snd_usb_endpoint *ep, unsigned int avail) { unsigned int sample_accum; int ret; if (ep->fill_max) return ep->maxframesize; sample_accum = ep->sample_accum + ep->sample_rem; if (sample_accum >= ep->pps) { sample_accum -= ep->pps; ret = ep->packsize[1]; } else { ret = ep->packsize[0]; } if (avail && ret >= avail) ret = -EAGAIN; else ep->sample_accum = sample_accum; return ret; } /* * snd_usb_endpoint_next_packet_size: Return the number of samples to be sent * in the next packet * * If the size is equal or exceeds @avail, don't proceed but return -EAGAIN * Exception: @avail = 0 for skipping the check. 
*/ int snd_usb_endpoint_next_packet_size(struct snd_usb_endpoint *ep, struct snd_urb_ctx *ctx, int idx, unsigned int avail) { unsigned int packet; packet = ctx->packet_size[idx]; if (packet) { if (avail && packet >= avail) return -EAGAIN; return packet; } if (ep->sync_source) return slave_next_packet_size(ep, avail); else return next_packet_size(ep, avail); } static void call_retire_callback(struct snd_usb_endpoint *ep, struct urb *urb) { struct snd_usb_substream *data_subs; data_subs = READ_ONCE(ep->data_subs); if (data_subs && ep->retire_data_urb) ep->retire_data_urb(data_subs, urb); } static void retire_outbound_urb(struct snd_usb_endpoint *ep, struct snd_urb_ctx *urb_ctx) { call_retire_callback(ep, urb_ctx->urb); } static void snd_usb_handle_sync_urb(struct snd_usb_endpoint *ep, struct snd_usb_endpoint *sender, const struct urb *urb); static void retire_inbound_urb(struct snd_usb_endpoint *ep, struct snd_urb_ctx *urb_ctx) { struct urb *urb = urb_ctx->urb; struct snd_usb_endpoint *sync_sink; if (unlikely(ep->skip_packets > 0)) { ep->skip_packets--; return; } sync_sink = READ_ONCE(ep->sync_sink); if (sync_sink) snd_usb_handle_sync_urb(sync_sink, ep, urb); call_retire_callback(ep, urb); } static inline bool has_tx_length_quirk(struct snd_usb_audio *chip) { return chip->quirk_flags & QUIRK_FLAG_TX_LENGTH; } static void prepare_silent_urb(struct snd_usb_endpoint *ep, struct snd_urb_ctx *ctx) { struct urb *urb = ctx->urb; unsigned int offs = 0; unsigned int extra = 0; __le32 packet_length; int i; /* For tx_length_quirk, put packet length at start of packet */ if (has_tx_length_quirk(ep->chip)) extra = sizeof(packet_length); for (i = 0; i < ctx->packets; ++i) { unsigned int offset; unsigned int length; int counts; counts = snd_usb_endpoint_next_packet_size(ep, ctx, i, 0); length = counts * ep->stride; /* number of silent bytes */ offset = offs * ep->stride + extra * i; urb->iso_frame_desc[i].offset = offset; urb->iso_frame_desc[i].length = length + extra; if (extra) { packet_length = cpu_to_le32(length); memcpy(urb->transfer_buffer + offset, &packet_length, sizeof(packet_length)); } memset(urb->transfer_buffer + offset + extra, ep->silence_value, length); offs += counts; } urb->number_of_packets = ctx->packets; urb->transfer_buffer_length = offs * ep->stride + ctx->packets * extra; ctx->queued = 0; } /* * Prepare a PLAYBACK urb for submission to the bus. */ static int prepare_outbound_urb(struct snd_usb_endpoint *ep, struct snd_urb_ctx *ctx, bool in_stream_lock) { struct urb *urb = ctx->urb; unsigned char *cp = urb->transfer_buffer; struct snd_usb_substream *data_subs; urb->dev = ep->chip->dev; /* we need to set this at each time */ switch (ep->type) { case SND_USB_ENDPOINT_TYPE_DATA: data_subs = READ_ONCE(ep->data_subs); if (data_subs && ep->prepare_data_urb) return ep->prepare_data_urb(data_subs, urb, in_stream_lock); /* no data provider, so send silence */ prepare_silent_urb(ep, ctx); break; case SND_USB_ENDPOINT_TYPE_SYNC: if (snd_usb_get_speed(ep->chip->dev) >= USB_SPEED_HIGH) { /* * fill the length and offset of each urb descriptor. * the fixed 12.13 frequency is passed as 16.16 through the pipe. */ urb->iso_frame_desc[0].length = 4; urb->iso_frame_desc[0].offset = 0; cp[0] = ep->freqn; cp[1] = ep->freqn >> 8; cp[2] = ep->freqn >> 16; cp[3] = ep->freqn >> 24; } else { /* * fill the length and offset of each urb descriptor. * the fixed 10.14 frequency is passed through the pipe. 
*/ urb->iso_frame_desc[0].length = 3; urb->iso_frame_desc[0].offset = 0; cp[0] = ep->freqn >> 2; cp[1] = ep->freqn >> 10; cp[2] = ep->freqn >> 18; } break; } return 0; } /* * Prepare a CAPTURE or SYNC urb for submission to the bus. */ static int prepare_inbound_urb(struct snd_usb_endpoint *ep, struct snd_urb_ctx *urb_ctx) { int i, offs; struct urb *urb = urb_ctx->urb; urb->dev = ep->chip->dev; /* we need to set this at each time */ switch (ep->type) { case SND_USB_ENDPOINT_TYPE_DATA: offs = 0; for (i = 0; i < urb_ctx->packets; i++) { urb->iso_frame_desc[i].offset = offs; urb->iso_frame_desc[i].length = ep->curpacksize; offs += ep->curpacksize; } urb->transfer_buffer_length = offs; urb->number_of_packets = urb_ctx->packets; break; case SND_USB_ENDPOINT_TYPE_SYNC: urb->iso_frame_desc[0].length = min(4u, ep->syncmaxsize); urb->iso_frame_desc[0].offset = 0; break; } return 0; } /* notify an error as XRUN to the assigned PCM data substream */ static void notify_xrun(struct snd_usb_endpoint *ep) { struct snd_usb_substream *data_subs; data_subs = READ_ONCE(ep->data_subs); if (data_subs && data_subs->pcm_substream) snd_pcm_stop_xrun(data_subs->pcm_substream); } static struct snd_usb_packet_info * next_packet_fifo_enqueue(struct snd_usb_endpoint *ep) { struct snd_usb_packet_info *p; p = ep->next_packet + (ep->next_packet_head + ep->next_packet_queued) % ARRAY_SIZE(ep->next_packet); ep->next_packet_queued++; return p; } static struct snd_usb_packet_info * next_packet_fifo_dequeue(struct snd_usb_endpoint *ep) { struct snd_usb_packet_info *p; p = ep->next_packet + ep->next_packet_head; ep->next_packet_head++; ep->next_packet_head %= ARRAY_SIZE(ep->next_packet); ep->next_packet_queued--; return p; } static void push_back_to_ready_list(struct snd_usb_endpoint *ep, struct snd_urb_ctx *ctx) { unsigned long flags; spin_lock_irqsave(&ep->lock, flags); list_add_tail(&ctx->ready_list, &ep->ready_playback_urbs); spin_unlock_irqrestore(&ep->lock, flags); } /* * Send output urbs that have been prepared previously. URBs are dequeued * from ep->ready_playback_urbs and in case there aren't any available * or there are no packets that have been prepared, this function does * nothing. * * The reason why the functionality of sending and preparing URBs is separated * is that host controllers don't guarantee the order in which they return * inbound and outbound packets to their submitters. * * This function is used both for implicit feedback endpoints and in low- * latency playback mode. 
*/ int snd_usb_queue_pending_output_urbs(struct snd_usb_endpoint *ep, bool in_stream_lock) { bool implicit_fb = snd_usb_endpoint_implicit_feedback_sink(ep); while (ep_state_running(ep)) { unsigned long flags; struct snd_usb_packet_info *packet; struct snd_urb_ctx *ctx = NULL; int err, i; spin_lock_irqsave(&ep->lock, flags); if ((!implicit_fb || ep->next_packet_queued > 0) && !list_empty(&ep->ready_playback_urbs)) { /* take URB out of FIFO */ ctx = list_first_entry(&ep->ready_playback_urbs, struct snd_urb_ctx, ready_list); list_del_init(&ctx->ready_list); if (implicit_fb) packet = next_packet_fifo_dequeue(ep); } spin_unlock_irqrestore(&ep->lock, flags); if (ctx == NULL) break; /* copy over the length information */ if (implicit_fb) { for (i = 0; i < packet->packets; i++) ctx->packet_size[i] = packet->packet_size[i]; } /* call the data handler to fill in playback data */ err = prepare_outbound_urb(ep, ctx, in_stream_lock); /* can be stopped during prepare callback */ if (unlikely(!ep_state_running(ep))) break; if (err < 0) { /* push back to ready list again for -EAGAIN */ if (err == -EAGAIN) { push_back_to_ready_list(ep, ctx); break; } if (!in_stream_lock) notify_xrun(ep); return -EPIPE; } if (!atomic_read(&ep->chip->shutdown)) err = usb_submit_urb(ctx->urb, GFP_ATOMIC); else err = -ENODEV; if (err < 0) { if (!atomic_read(&ep->chip->shutdown)) { usb_audio_err(ep->chip, "Unable to submit urb #%d: %d at %s\n", ctx->index, err, __func__); if (!in_stream_lock) notify_xrun(ep); } return -EPIPE; } set_bit(ctx->index, &ep->active_mask); atomic_inc(&ep->submitted_urbs); } return 0; } /* * complete callback for urbs */ static void snd_complete_urb(struct urb *urb) { struct snd_urb_ctx *ctx = urb->context; struct snd_usb_endpoint *ep = ctx->ep; int err; if (unlikely(urb->status == -ENOENT || /* unlinked */ urb->status == -ENODEV || /* device removed */ urb->status == -ECONNRESET || /* unlinked */ urb->status == -ESHUTDOWN)) /* device disabled */ goto exit_clear; /* device disconnected */ if (unlikely(atomic_read(&ep->chip->shutdown))) goto exit_clear; if (unlikely(!ep_state_running(ep))) goto exit_clear; if (usb_pipeout(ep->pipe)) { retire_outbound_urb(ep, ctx); /* can be stopped during retire callback */ if (unlikely(!ep_state_running(ep))) goto exit_clear; /* in low-latency and implicit-feedback modes, push back the * URB to ready list at first, then process as much as possible */ if (ep->lowlatency_playback || snd_usb_endpoint_implicit_feedback_sink(ep)) { push_back_to_ready_list(ep, ctx); clear_bit(ctx->index, &ep->active_mask); snd_usb_queue_pending_output_urbs(ep, false); atomic_dec(&ep->submitted_urbs); /* decrement at last */ return; } /* in non-lowlatency mode, no error handling for prepare */ prepare_outbound_urb(ep, ctx, false); /* can be stopped during prepare callback */ if (unlikely(!ep_state_running(ep))) goto exit_clear; } else { retire_inbound_urb(ep, ctx); /* can be stopped during retire callback */ if (unlikely(!ep_state_running(ep))) goto exit_clear; prepare_inbound_urb(ep, ctx); } if (!atomic_read(&ep->chip->shutdown)) err = usb_submit_urb(urb, GFP_ATOMIC); else err = -ENODEV; if (err == 0) return; if (!atomic_read(&ep->chip->shutdown)) { usb_audio_err(ep->chip, "cannot submit urb (err = %d)\n", err); notify_xrun(ep); } exit_clear: clear_bit(ctx->index, &ep->active_mask); atomic_dec(&ep->submitted_urbs); } /* * Find or create a refcount object for the given interface * * The objects are released altogether in snd_usb_endpoint_free_all() */ static struct snd_usb_iface_ref * 
iface_ref_find(struct snd_usb_audio *chip, int iface) { struct snd_usb_iface_ref *ip; list_for_each_entry(ip, &chip->iface_ref_list, list) if (ip->iface == iface) return ip; ip = kzalloc(sizeof(*ip), GFP_KERNEL); if (!ip) return NULL; ip->iface = iface; list_add_tail(&ip->list, &chip->iface_ref_list); return ip; } /* Similarly, a refcount object for clock */ static struct snd_usb_clock_ref * clock_ref_find(struct snd_usb_audio *chip, int clock) { struct snd_usb_clock_ref *ref; list_for_each_entry(ref, &chip->clock_ref_list, list) if (ref->clock == clock) return ref; ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!ref) return NULL; ref->clock = clock; atomic_set(&ref->locked, 0); list_add_tail(&ref->list, &chip->clock_ref_list); return ref; } /* * Get the existing endpoint object corresponding EP * Returns NULL if not present. */ struct snd_usb_endpoint * snd_usb_get_endpoint(struct snd_usb_audio *chip, int ep_num) { struct snd_usb_endpoint *ep; list_for_each_entry(ep, &chip->ep_list, list) { if (ep->ep_num == ep_num) return ep; } return NULL; } #define ep_type_name(type) \ (type == SND_USB_ENDPOINT_TYPE_DATA ? "data" : "sync") /** * snd_usb_add_endpoint: Add an endpoint to an USB audio chip * * @chip: The chip * @ep_num: The number of the endpoint to use * @type: SND_USB_ENDPOINT_TYPE_DATA or SND_USB_ENDPOINT_TYPE_SYNC * * If the requested endpoint has not been added to the given chip before, * a new instance is created. * * Returns zero on success or a negative error code. * * New endpoints will be added to chip->ep_list and freed by * calling snd_usb_endpoint_free_all(). * * For SND_USB_ENDPOINT_TYPE_SYNC, the caller needs to guarantee that * bNumEndpoints > 1 beforehand. */ int snd_usb_add_endpoint(struct snd_usb_audio *chip, int ep_num, int type) { struct snd_usb_endpoint *ep; bool is_playback; ep = snd_usb_get_endpoint(chip, ep_num); if (ep) return 0; usb_audio_dbg(chip, "Creating new %s endpoint #%x\n", ep_type_name(type), ep_num); ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (!ep) return -ENOMEM; ep->chip = chip; spin_lock_init(&ep->lock); ep->type = type; ep->ep_num = ep_num; INIT_LIST_HEAD(&ep->ready_playback_urbs); atomic_set(&ep->submitted_urbs, 0); is_playback = ((ep_num & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT); ep_num &= USB_ENDPOINT_NUMBER_MASK; if (is_playback) ep->pipe = usb_sndisocpipe(chip->dev, ep_num); else ep->pipe = usb_rcvisocpipe(chip->dev, ep_num); list_add_tail(&ep->list, &chip->ep_list); return 0; } /* Set up syncinterval and maxsyncsize for a sync EP */ static void endpoint_set_syncinterval(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { struct usb_host_interface *alts; struct usb_endpoint_descriptor *desc; alts = snd_usb_get_host_interface(chip, ep->iface, ep->altsetting); if (!alts) return; desc = get_endpoint(alts, ep->ep_idx); if (desc->bLength >= USB_DT_ENDPOINT_AUDIO_SIZE && desc->bRefresh >= 1 && desc->bRefresh <= 9) ep->syncinterval = desc->bRefresh; else if (snd_usb_get_speed(chip->dev) == USB_SPEED_FULL) ep->syncinterval = 1; else if (desc->bInterval >= 1 && desc->bInterval <= 16) ep->syncinterval = desc->bInterval - 1; else ep->syncinterval = 3; ep->syncmaxsize = le16_to_cpu(desc->wMaxPacketSize); } static bool endpoint_compatible(struct snd_usb_endpoint *ep, const struct audioformat *fp, const struct snd_pcm_hw_params *params) { if (!ep->opened) return false; if (ep->cur_audiofmt != fp) return false; if (ep->cur_rate != params_rate(params) || ep->cur_format != params_format(params) || ep->cur_period_frames != params_period_size(params) || 
ep->cur_buffer_periods != params_periods(params)) return false; return true; } /* * Check whether the given fp and hw params are compatible with the current * setup of the target EP for implicit feedback sync */ bool snd_usb_endpoint_compatible(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep, const struct audioformat *fp, const struct snd_pcm_hw_params *params) { bool ret; mutex_lock(&chip->mutex); ret = endpoint_compatible(ep, fp, params); mutex_unlock(&chip->mutex); return ret; } /* * snd_usb_endpoint_open: Open the endpoint * * Called from hw_params to assign the endpoint to the substream. * It's reference-counted, and only the first opener is allowed to set up * arbitrary parameters. The later opener must be compatible with the * former opened parameters. * The endpoint needs to be closed via snd_usb_endpoint_close() later. * * Note that this function doesn't configure the endpoint. The substream * needs to set it up later via snd_usb_endpoint_set_params() and * snd_usb_endpoint_prepare(). */ struct snd_usb_endpoint * snd_usb_endpoint_open(struct snd_usb_audio *chip, const struct audioformat *fp, const struct snd_pcm_hw_params *params, bool is_sync_ep, bool fixed_rate) { struct snd_usb_endpoint *ep; int ep_num = is_sync_ep ? fp->sync_ep : fp->endpoint; mutex_lock(&chip->mutex); ep = snd_usb_get_endpoint(chip, ep_num); if (!ep) { usb_audio_err(chip, "Cannot find EP 0x%x to open\n", ep_num); goto unlock; } if (!ep->opened) { if (is_sync_ep) { ep->iface = fp->sync_iface; ep->altsetting = fp->sync_altsetting; ep->ep_idx = fp->sync_ep_idx; } else { ep->iface = fp->iface; ep->altsetting = fp->altsetting; ep->ep_idx = fp->ep_idx; } usb_audio_dbg(chip, "Open EP 0x%x, iface=%d:%d, idx=%d\n", ep_num, ep->iface, ep->altsetting, ep->ep_idx); ep->iface_ref = iface_ref_find(chip, ep->iface); if (!ep->iface_ref) { ep = NULL; goto unlock; } if (fp->protocol != UAC_VERSION_1) { ep->clock_ref = clock_ref_find(chip, fp->clock); if (!ep->clock_ref) { ep = NULL; goto unlock; } ep->clock_ref->opened++; } ep->cur_audiofmt = fp; ep->cur_channels = fp->channels; ep->cur_rate = params_rate(params); ep->cur_format = params_format(params); ep->cur_frame_bytes = snd_pcm_format_physical_width(ep->cur_format) * ep->cur_channels / 8; ep->cur_period_frames = params_period_size(params); ep->cur_period_bytes = ep->cur_period_frames * ep->cur_frame_bytes; ep->cur_buffer_periods = params_periods(params); if (ep->type == SND_USB_ENDPOINT_TYPE_SYNC) endpoint_set_syncinterval(chip, ep); ep->implicit_fb_sync = fp->implicit_fb; ep->need_setup = true; ep->need_prepare = true; ep->fixed_rate = fixed_rate; usb_audio_dbg(chip, " channels=%d, rate=%d, format=%s, period_bytes=%d, periods=%d, implicit_fb=%d\n", ep->cur_channels, ep->cur_rate, snd_pcm_format_name(ep->cur_format), ep->cur_period_bytes, ep->cur_buffer_periods, ep->implicit_fb_sync); } else { if (WARN_ON(!ep->iface_ref)) { ep = NULL; goto unlock; } if (!endpoint_compatible(ep, fp, params)) { usb_audio_err(chip, "Incompatible EP setup for 0x%x\n", ep_num); ep = NULL; goto unlock; } usb_audio_dbg(chip, "Reopened EP 0x%x (count %d)\n", ep_num, ep->opened); } if (!ep->iface_ref->opened++) ep->iface_ref->need_setup = true; ep->opened++; unlock: mutex_unlock(&chip->mutex); return ep; } /* * snd_usb_endpoint_set_sync: Link data and sync endpoints * * Pass NULL to sync_ep to unlink again */ void snd_usb_endpoint_set_sync(struct snd_usb_audio *chip, struct snd_usb_endpoint *data_ep, struct snd_usb_endpoint *sync_ep) { data_ep->sync_source = sync_ep; } /* * Set data 
endpoint callbacks and the assigned data stream * * Called at PCM trigger and cleanups. * Pass NULL to deactivate each callback. */ void snd_usb_endpoint_set_callback(struct snd_usb_endpoint *ep, int (*prepare)(struct snd_usb_substream *subs, struct urb *urb, bool in_stream_lock), void (*retire)(struct snd_usb_substream *subs, struct urb *urb), struct snd_usb_substream *data_subs) { ep->prepare_data_urb = prepare; ep->retire_data_urb = retire; if (data_subs) ep->lowlatency_playback = data_subs->lowlatency_playback; else ep->lowlatency_playback = false; WRITE_ONCE(ep->data_subs, data_subs); } static int endpoint_set_interface(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep, bool set) { int altset = set ? ep->altsetting : 0; int err; if (ep->iface_ref->altset == altset) return 0; /* already disconnected? */ if (unlikely(atomic_read(&chip->shutdown))) return -ENODEV; usb_audio_dbg(chip, "Setting usb interface %d:%d for EP 0x%x\n", ep->iface, altset, ep->ep_num); err = usb_set_interface(chip->dev, ep->iface, altset); if (err < 0) { usb_audio_err_ratelimited( chip, "%d:%d: usb_set_interface failed (%d)\n", ep->iface, altset, err); return err; } if (chip->quirk_flags & QUIRK_FLAG_IFACE_DELAY) msleep(50); ep->iface_ref->altset = altset; return 0; } /* * snd_usb_endpoint_close: Close the endpoint * * Unreference the already opened endpoint via snd_usb_endpoint_open(). */ void snd_usb_endpoint_close(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { mutex_lock(&chip->mutex); usb_audio_dbg(chip, "Closing EP 0x%x (count %d)\n", ep->ep_num, ep->opened); if (!--ep->iface_ref->opened && !(chip->quirk_flags & QUIRK_FLAG_IFACE_SKIP_CLOSE)) endpoint_set_interface(chip, ep, false); if (!--ep->opened) { if (ep->clock_ref) { if (!--ep->clock_ref->opened) ep->clock_ref->rate = 0; } ep->iface = 0; ep->altsetting = 0; ep->cur_audiofmt = NULL; ep->cur_rate = 0; ep->iface_ref = NULL; ep->clock_ref = NULL; usb_audio_dbg(chip, "EP 0x%x closed\n", ep->ep_num); } mutex_unlock(&chip->mutex); } /* Prepare for suspening EP, called from the main suspend handler */ void snd_usb_endpoint_suspend(struct snd_usb_endpoint *ep) { ep->need_prepare = true; if (ep->iface_ref) ep->iface_ref->need_setup = true; if (ep->clock_ref) ep->clock_ref->rate = 0; } /* * wait until all urbs are processed. */ static int wait_clear_urbs(struct snd_usb_endpoint *ep) { unsigned long end_time = jiffies + msecs_to_jiffies(1000); int alive; if (atomic_read(&ep->state) != EP_STATE_STOPPING) return 0; do { alive = atomic_read(&ep->submitted_urbs); if (!alive) break; schedule_timeout_uninterruptible(1); } while (time_before(jiffies, end_time)); if (alive) usb_audio_err(ep->chip, "timeout: still %d active urbs on EP #%x\n", alive, ep->ep_num); if (ep_state_update(ep, EP_STATE_STOPPING, EP_STATE_STOPPED)) { ep->sync_sink = NULL; snd_usb_endpoint_set_callback(ep, NULL, NULL, NULL); } return 0; } /* sync the pending stop operation; * this function itself doesn't trigger the stop operation */ void snd_usb_endpoint_sync_pending_stop(struct snd_usb_endpoint *ep) { if (ep) wait_clear_urbs(ep); } /* * Stop active urbs * * This function moves the EP to STOPPING state if it's being RUNNING. 
*/ static int stop_urbs(struct snd_usb_endpoint *ep, bool force, bool keep_pending) { unsigned int i; unsigned long flags; if (!force && atomic_read(&ep->running)) return -EBUSY; if (!ep_state_update(ep, EP_STATE_RUNNING, EP_STATE_STOPPING)) return 0; spin_lock_irqsave(&ep->lock, flags); INIT_LIST_HEAD(&ep->ready_playback_urbs); ep->next_packet_head = 0; ep->next_packet_queued = 0; spin_unlock_irqrestore(&ep->lock, flags); if (keep_pending) return 0; for (i = 0; i < ep->nurbs; i++) { if (test_bit(i, &ep->active_mask)) { if (!test_and_set_bit(i, &ep->unlink_mask)) { struct urb *u = ep->urb[i].urb; usb_unlink_urb(u); } } } return 0; } /* * release an endpoint's urbs */ static int release_urbs(struct snd_usb_endpoint *ep, bool force) { int i, err; /* route incoming urbs to nirvana */ snd_usb_endpoint_set_callback(ep, NULL, NULL, NULL); /* stop and unlink urbs */ err = stop_urbs(ep, force, false); if (err) return err; wait_clear_urbs(ep); for (i = 0; i < ep->nurbs; i++) release_urb_ctx(&ep->urb[i]); usb_free_coherent(ep->chip->dev, SYNC_URBS * 4, ep->syncbuf, ep->sync_dma); ep->syncbuf = NULL; ep->nurbs = 0; return 0; } /* * configure a data endpoint */ static int data_ep_set_params(struct snd_usb_endpoint *ep) { struct snd_usb_audio *chip = ep->chip; unsigned int maxsize, minsize, packs_per_ms, max_packs_per_urb; unsigned int max_packs_per_period, urbs_per_period, urb_packs; unsigned int max_urbs, i; const struct audioformat *fmt = ep->cur_audiofmt; int frame_bits = ep->cur_frame_bytes * 8; int tx_length_quirk = (has_tx_length_quirk(chip) && usb_pipeout(ep->pipe)); usb_audio_dbg(chip, "Setting params for data EP 0x%x, pipe 0x%x\n", ep->ep_num, ep->pipe); if (ep->cur_format == SNDRV_PCM_FORMAT_DSD_U16_LE && fmt->dsd_dop) { /* * When operating in DSD DOP mode, the size of a sample frame * in hardware differs from the actual physical format width * because we need to make room for the DOP markers. */ frame_bits += ep->cur_channels << 3; } ep->datainterval = fmt->datainterval; ep->stride = frame_bits >> 3; switch (ep->cur_format) { case SNDRV_PCM_FORMAT_U8: ep->silence_value = 0x80; break; case SNDRV_PCM_FORMAT_DSD_U8: case SNDRV_PCM_FORMAT_DSD_U16_LE: case SNDRV_PCM_FORMAT_DSD_U32_LE: case SNDRV_PCM_FORMAT_DSD_U16_BE: case SNDRV_PCM_FORMAT_DSD_U32_BE: ep->silence_value = 0x69; break; default: ep->silence_value = 0; } /* assume max. frequency is 50% higher than nominal */ ep->freqmax = ep->freqn + (ep->freqn >> 1); /* Round up freqmax to nearest integer in order to calculate maximum * packet size, which must represent a whole number of frames. * This is accomplished by adding 0x0.ffff before converting the * Q16.16 format into integer. * In order to accurately calculate the maximum packet size when * the data interval is more than 1 (i.e. ep->datainterval > 0), * multiply by the data interval prior to rounding. For instance, * a freqmax of 41 kHz will result in a max packet size of 6 (5.125) * frames with a data interval of 1, but 11 (10.25) frames with a * data interval of 2. * (ep->freqmax << ep->datainterval overflows at 8.192 MHz for the * maximum datainterval value of 3, at USB full speed, higher for * USB high speed, noting that ep->freqmax is in units of * frames per packet in Q16.16 format.) */ maxsize = (((ep->freqmax << ep->datainterval) + 0xffff) >> 16) * (frame_bits >> 3); if (tx_length_quirk) maxsize += sizeof(__le32); /* Space for length descriptor */ /* but wMaxPacketSize might reduce this */ if (ep->maxpacksize && ep->maxpacksize < maxsize) { /* whatever fits into a max. 
size packet */ unsigned int data_maxsize = maxsize = ep->maxpacksize; if (tx_length_quirk) /* Need to remove the length descriptor to calc freq */ data_maxsize -= sizeof(__le32); ep->freqmax = (data_maxsize / (frame_bits >> 3)) << (16 - ep->datainterval); } if (ep->fill_max) ep->curpacksize = ep->maxpacksize; else ep->curpacksize = maxsize; if (snd_usb_get_speed(chip->dev) != USB_SPEED_FULL) { packs_per_ms = 8 >> ep->datainterval; max_packs_per_urb = MAX_PACKS_HS; } else { packs_per_ms = 1; max_packs_per_urb = MAX_PACKS; } if (ep->sync_source && !ep->implicit_fb_sync) max_packs_per_urb = min(max_packs_per_urb, 1U << ep->sync_source->syncinterval); max_packs_per_urb = max(1u, max_packs_per_urb >> ep->datainterval); /* * Capture endpoints need to use small URBs because there's no way * to tell in advance where the next period will end, and we don't * want the next URB to complete much after the period ends. * * Playback endpoints with implicit sync much use the same parameters * as their corresponding capture endpoint. */ if (usb_pipein(ep->pipe) || ep->implicit_fb_sync) { /* make capture URBs <= 1 ms and smaller than a period */ urb_packs = min(max_packs_per_urb, packs_per_ms); while (urb_packs > 1 && urb_packs * maxsize >= ep->cur_period_bytes) urb_packs >>= 1; ep->nurbs = MAX_URBS; /* * Playback endpoints without implicit sync are adjusted so that * a period fits as evenly as possible in the smallest number of * URBs. The total number of URBs is adjusted to the size of the * ALSA buffer, subject to the MAX_URBS and MAX_QUEUE limits. */ } else { /* determine how small a packet can be */ minsize = (ep->freqn >> (16 - ep->datainterval)) * (frame_bits >> 3); /* with sync from device, assume it can be 12% lower */ if (ep->sync_source) minsize -= minsize >> 3; minsize = max(minsize, 1u); /* how many packets will contain an entire ALSA period? */ max_packs_per_period = DIV_ROUND_UP(ep->cur_period_bytes, minsize); /* how many URBs will contain a period? */ urbs_per_period = DIV_ROUND_UP(max_packs_per_period, max_packs_per_urb); /* how many packets are needed in each URB? 
*/ urb_packs = DIV_ROUND_UP(max_packs_per_period, urbs_per_period); /* limit the number of frames in a single URB */ ep->max_urb_frames = DIV_ROUND_UP(ep->cur_period_frames, urbs_per_period); /* try to use enough URBs to contain an entire ALSA buffer */ max_urbs = min((unsigned) MAX_URBS, MAX_QUEUE * packs_per_ms / urb_packs); ep->nurbs = min(max_urbs, urbs_per_period * ep->cur_buffer_periods); } /* allocate and initialize data urbs */ for (i = 0; i < ep->nurbs; i++) { struct snd_urb_ctx *u = &ep->urb[i]; u->index = i; u->ep = ep; u->packets = urb_packs; u->buffer_size = maxsize * u->packets; if (fmt->fmt_type == UAC_FORMAT_TYPE_II) u->packets++; /* for transfer delimiter */ u->urb = usb_alloc_urb(u->packets, GFP_KERNEL); if (!u->urb) goto out_of_memory; u->urb->transfer_buffer = usb_alloc_coherent(chip->dev, u->buffer_size, GFP_KERNEL, &u->urb->transfer_dma); if (!u->urb->transfer_buffer) goto out_of_memory; u->urb->pipe = ep->pipe; u->urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; u->urb->interval = 1 << ep->datainterval; u->urb->context = u; u->urb->complete = snd_complete_urb; INIT_LIST_HEAD(&u->ready_list); } return 0; out_of_memory: release_urbs(ep, false); return -ENOMEM; } /* * configure a sync endpoint */ static int sync_ep_set_params(struct snd_usb_endpoint *ep) { struct snd_usb_audio *chip = ep->chip; int i; usb_audio_dbg(chip, "Setting params for sync EP 0x%x, pipe 0x%x\n", ep->ep_num, ep->pipe); ep->syncbuf = usb_alloc_coherent(chip->dev, SYNC_URBS * 4, GFP_KERNEL, &ep->sync_dma); if (!ep->syncbuf) return -ENOMEM; ep->nurbs = SYNC_URBS; for (i = 0; i < SYNC_URBS; i++) { struct snd_urb_ctx *u = &ep->urb[i]; u->index = i; u->ep = ep; u->packets = 1; u->urb = usb_alloc_urb(1, GFP_KERNEL); if (!u->urb) goto out_of_memory; u->urb->transfer_buffer = ep->syncbuf + i * 4; u->urb->transfer_dma = ep->sync_dma + i * 4; u->urb->transfer_buffer_length = 4; u->urb->pipe = ep->pipe; u->urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; u->urb->number_of_packets = 1; u->urb->interval = 1 << ep->syncinterval; u->urb->context = u; u->urb->complete = snd_complete_urb; } return 0; out_of_memory: release_urbs(ep, false); return -ENOMEM; } /* update the rate of the referred clock; return the actual rate */ static int update_clock_ref_rate(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { struct snd_usb_clock_ref *clock = ep->clock_ref; int rate = ep->cur_rate; if (!clock || clock->rate == rate) return rate; if (clock->rate) { if (atomic_read(&clock->locked)) return clock->rate; if (clock->rate != rate) { usb_audio_err(chip, "Mismatched sample rate %d vs %d for EP 0x%x\n", clock->rate, rate, ep->ep_num); return clock->rate; } } clock->rate = rate; clock->need_setup = true; return rate; } /* * snd_usb_endpoint_set_params: configure an snd_usb_endpoint * * It's called either from hw_params callback. * Determine the number of URBs to be used on this endpoint. * An endpoint must be configured before it can be started. * An endpoint that is already running can not be reconfigured. 
*/ int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { const struct audioformat *fmt = ep->cur_audiofmt; int err = 0; mutex_lock(&chip->mutex); if (!ep->need_setup) goto unlock; /* release old buffers, if any */ err = release_urbs(ep, false); if (err < 0) goto unlock; ep->datainterval = fmt->datainterval; ep->maxpacksize = fmt->maxpacksize; ep->fill_max = !!(fmt->attributes & UAC_EP_CS_ATTR_FILL_MAX); if (snd_usb_get_speed(chip->dev) == USB_SPEED_FULL) { ep->freqn = get_usb_full_speed_rate(ep->cur_rate); ep->pps = 1000 >> ep->datainterval; } else { ep->freqn = get_usb_high_speed_rate(ep->cur_rate); ep->pps = 8000 >> ep->datainterval; } ep->sample_rem = ep->cur_rate % ep->pps; ep->packsize[0] = ep->cur_rate / ep->pps; ep->packsize[1] = (ep->cur_rate + (ep->pps - 1)) / ep->pps; /* calculate the frequency in 16.16 format */ ep->freqm = ep->freqn; ep->freqshift = INT_MIN; ep->phase = 0; switch (ep->type) { case SND_USB_ENDPOINT_TYPE_DATA: err = data_ep_set_params(ep); break; case SND_USB_ENDPOINT_TYPE_SYNC: err = sync_ep_set_params(ep); break; default: err = -EINVAL; } usb_audio_dbg(chip, "Set up %d URBS, ret=%d\n", ep->nurbs, err); if (err < 0) goto unlock; /* some unit conversions in runtime */ ep->maxframesize = ep->maxpacksize / ep->cur_frame_bytes; ep->curframesize = ep->curpacksize / ep->cur_frame_bytes; err = update_clock_ref_rate(chip, ep); if (err >= 0) { ep->need_setup = false; err = 0; } unlock: mutex_unlock(&chip->mutex); return err; } static int init_sample_rate(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { struct snd_usb_clock_ref *clock = ep->clock_ref; int rate, err; rate = update_clock_ref_rate(chip, ep); if (rate < 0) return rate; if (clock && !clock->need_setup) return 0; if (!ep->fixed_rate) { err = snd_usb_init_sample_rate(chip, ep->cur_audiofmt, rate); if (err < 0) { if (clock) clock->rate = 0; /* reset rate */ return err; } } if (clock) clock->need_setup = false; return 0; } /* * snd_usb_endpoint_prepare: Prepare the endpoint * * This function sets up the EP to be fully usable state. * It's called either from prepare callback. * The function checks need_setup flag, and performs nothing unless needed, * so it's safe to call this multiple times. * * This returns zero if unchanged, 1 if the configuration has changed, * or a negative error code. */ int snd_usb_endpoint_prepare(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { bool iface_first; int err = 0; mutex_lock(&chip->mutex); if (WARN_ON(!ep->iface_ref)) goto unlock; if (!ep->need_prepare) goto unlock; /* If the interface has been already set up, just set EP parameters */ if (!ep->iface_ref->need_setup) { /* sample rate setup of UAC1 is per endpoint, and we need * to update at each EP configuration */ if (ep->cur_audiofmt->protocol == UAC_VERSION_1) { err = init_sample_rate(chip, ep); if (err < 0) goto unlock; } goto done; } /* Need to deselect altsetting at first */ endpoint_set_interface(chip, ep, false); /* Some UAC1 devices (e.g. 
Yamaha THR10) need the host interface * to be set up before parameter setups */ iface_first = ep->cur_audiofmt->protocol == UAC_VERSION_1; /* Workaround for devices that require the interface setup at first like UAC1 */ if (chip->quirk_flags & QUIRK_FLAG_SET_IFACE_FIRST) iface_first = true; if (iface_first) { err = endpoint_set_interface(chip, ep, true); if (err < 0) goto unlock; } err = snd_usb_init_pitch(chip, ep->cur_audiofmt); if (err < 0) goto unlock; err = init_sample_rate(chip, ep); if (err < 0) goto unlock; err = snd_usb_select_mode_quirk(chip, ep->cur_audiofmt); if (err < 0) goto unlock; /* for UAC2/3, enable the interface altset here at last */ if (!iface_first) { err = endpoint_set_interface(chip, ep, true); if (err < 0) goto unlock; } ep->iface_ref->need_setup = false; done: ep->need_prepare = false; err = 1; unlock: mutex_unlock(&chip->mutex); return err; } /* get the current rate set to the given clock by any endpoint */ int snd_usb_endpoint_get_clock_rate(struct snd_usb_audio *chip, int clock) { struct snd_usb_clock_ref *ref; int rate = 0; if (!clock) return 0; mutex_lock(&chip->mutex); list_for_each_entry(ref, &chip->clock_ref_list, list) { if (ref->clock == clock) { rate = ref->rate; break; } } mutex_unlock(&chip->mutex); return rate; } /** * snd_usb_endpoint_start: start an snd_usb_endpoint * * @ep: the endpoint to start * * A call to this function will increment the running count of the endpoint. * In case it is not already running, the URBs for this endpoint will be * submitted. Otherwise, this function does nothing. * * Must be balanced to calls of snd_usb_endpoint_stop(). * * Returns an error if the URB submission failed, 0 in all other cases. */ int snd_usb_endpoint_start(struct snd_usb_endpoint *ep) { bool is_playback = usb_pipeout(ep->pipe); int err; unsigned int i; if (atomic_read(&ep->chip->shutdown)) return -EBADFD; if (ep->sync_source) WRITE_ONCE(ep->sync_source->sync_sink, ep); usb_audio_dbg(ep->chip, "Starting %s EP 0x%x (running %d)\n", ep_type_name(ep->type), ep->ep_num, atomic_read(&ep->running)); /* already running? */ if (atomic_inc_return(&ep->running) != 1) return 0; if (ep->clock_ref) atomic_inc(&ep->clock_ref->locked); ep->active_mask = 0; ep->unlink_mask = 0; ep->phase = 0; ep->sample_accum = 0; snd_usb_endpoint_start_quirk(ep); /* * If this endpoint has a data endpoint as implicit feedback source, * don't start the urbs here. Instead, mark them all as available, * wait for the record urbs to return and queue the playback urbs * from that context. 
*/ if (!ep_state_update(ep, EP_STATE_STOPPED, EP_STATE_RUNNING)) goto __error; if (snd_usb_endpoint_implicit_feedback_sink(ep) && !(ep->chip->quirk_flags & QUIRK_FLAG_PLAYBACK_FIRST)) { usb_audio_dbg(ep->chip, "No URB submission due to implicit fb sync\n"); i = 0; goto fill_rest; } for (i = 0; i < ep->nurbs; i++) { struct urb *urb = ep->urb[i].urb; if (snd_BUG_ON(!urb)) goto __error; if (is_playback) err = prepare_outbound_urb(ep, urb->context, true); else err = prepare_inbound_urb(ep, urb->context); if (err < 0) { /* stop filling at applptr */ if (err == -EAGAIN) break; usb_audio_dbg(ep->chip, "EP 0x%x: failed to prepare urb: %d\n", ep->ep_num, err); goto __error; } if (!atomic_read(&ep->chip->shutdown)) err = usb_submit_urb(urb, GFP_ATOMIC); else err = -ENODEV; if (err < 0) { if (!atomic_read(&ep->chip->shutdown)) usb_audio_err(ep->chip, "cannot submit urb %d, error %d: %s\n", i, err, usb_error_string(err)); goto __error; } set_bit(i, &ep->active_mask); atomic_inc(&ep->submitted_urbs); } if (!i) { usb_audio_dbg(ep->chip, "XRUN at starting EP 0x%x\n", ep->ep_num); goto __error; } usb_audio_dbg(ep->chip, "%d URBs submitted for EP 0x%x\n", i, ep->ep_num); fill_rest: /* put the remaining URBs to ready list */ if (is_playback) { for (; i < ep->nurbs; i++) push_back_to_ready_list(ep, ep->urb + i); } return 0; __error: snd_usb_endpoint_stop(ep, false); return -EPIPE; } /** * snd_usb_endpoint_stop: stop an snd_usb_endpoint * * @ep: the endpoint to stop (may be NULL) * @keep_pending: keep in-flight URBs * * A call to this function will decrement the running count of the endpoint. * In case the last user has requested the endpoint stop, the URBs will * actually be deactivated. * * Must be balanced to calls of snd_usb_endpoint_start(). * * The caller needs to synchronize the pending stop operation via * snd_usb_endpoint_sync_pending_stop(). */ void snd_usb_endpoint_stop(struct snd_usb_endpoint *ep, bool keep_pending) { if (!ep) return; usb_audio_dbg(ep->chip, "Stopping %s EP 0x%x (running %d)\n", ep_type_name(ep->type), ep->ep_num, atomic_read(&ep->running)); if (snd_BUG_ON(!atomic_read(&ep->running))) return; if (!atomic_dec_return(&ep->running)) { if (ep->sync_source) WRITE_ONCE(ep->sync_source->sync_sink, NULL); stop_urbs(ep, false, keep_pending); if (ep->clock_ref) atomic_dec(&ep->clock_ref->locked); if (ep->chip->quirk_flags & QUIRK_FLAG_FORCE_IFACE_RESET && usb_pipeout(ep->pipe)) { ep->need_prepare = true; if (ep->iface_ref) ep->iface_ref->need_setup = true; } } } /** * snd_usb_endpoint_release: Tear down an snd_usb_endpoint * * @ep: the endpoint to release * * This function does not care for the endpoint's running count but will tear * down all the streaming URBs immediately. 
*/ void snd_usb_endpoint_release(struct snd_usb_endpoint *ep) { release_urbs(ep, true); } /** * snd_usb_endpoint_free_all: Free the resources of an snd_usb_endpoint * @chip: The chip * * This free all endpoints and those resources */ void snd_usb_endpoint_free_all(struct snd_usb_audio *chip) { struct snd_usb_endpoint *ep, *en; struct snd_usb_iface_ref *ip, *in; struct snd_usb_clock_ref *cp, *cn; list_for_each_entry_safe(ep, en, &chip->ep_list, list) kfree(ep); list_for_each_entry_safe(ip, in, &chip->iface_ref_list, list) kfree(ip); list_for_each_entry_safe(cp, cn, &chip->clock_ref_list, list) kfree(cp); } /* * snd_usb_handle_sync_urb: parse an USB sync packet * * @ep: the endpoint to handle the packet * @sender: the sending endpoint * @urb: the received packet * * This function is called from the context of an endpoint that received * the packet and is used to let another endpoint object handle the payload. */ static void snd_usb_handle_sync_urb(struct snd_usb_endpoint *ep, struct snd_usb_endpoint *sender, const struct urb *urb) { int shift; unsigned int f; unsigned long flags; snd_BUG_ON(ep == sender); /* * In case the endpoint is operating in implicit feedback mode, prepare * a new outbound URB that has the same layout as the received packet * and add it to the list of pending urbs. queue_pending_output_urbs() * will take care of them later. */ if (snd_usb_endpoint_implicit_feedback_sink(ep) && atomic_read(&ep->running)) { /* implicit feedback case */ int i, bytes = 0; struct snd_urb_ctx *in_ctx; struct snd_usb_packet_info *out_packet; in_ctx = urb->context; /* Count overall packet size */ for (i = 0; i < in_ctx->packets; i++) if (urb->iso_frame_desc[i].status == 0) bytes += urb->iso_frame_desc[i].actual_length; /* * skip empty packets. At least M-Audio's Fast Track Ultra stops * streaming once it received a 0-byte OUT URB */ if (bytes == 0) return; spin_lock_irqsave(&ep->lock, flags); if (ep->next_packet_queued >= ARRAY_SIZE(ep->next_packet)) { spin_unlock_irqrestore(&ep->lock, flags); usb_audio_err(ep->chip, "next package FIFO overflow EP 0x%x\n", ep->ep_num); notify_xrun(ep); return; } out_packet = next_packet_fifo_enqueue(ep); /* * Iterate through the inbound packet and prepare the lengths * for the output packet. The OUT packet we are about to send * will have the same amount of payload bytes per stride as the * IN packet we just received. Since the actual size is scaled * by the stride, use the sender stride to calculate the length * in case the number of channels differ between the implicitly * fed-back endpoint and the synchronizing endpoint. */ out_packet->packets = in_ctx->packets; for (i = 0; i < in_ctx->packets; i++) { if (urb->iso_frame_desc[i].status == 0) out_packet->packet_size[i] = urb->iso_frame_desc[i].actual_length / sender->stride; else out_packet->packet_size[i] = 0; } spin_unlock_irqrestore(&ep->lock, flags); snd_usb_queue_pending_output_urbs(ep, false); return; } /* * process after playback sync complete * * Full speed devices report feedback values in 10.14 format as samples * per frame, high speed devices in 16.16 format as samples per * microframe. * * Because the Audio Class 1 spec was written before USB 2.0, many high * speed devices use a wrong interpretation, some others use an * entirely different format. * * Therefore, we cannot predict what format any particular device uses * and must detect it automatically. 
*/ if (urb->iso_frame_desc[0].status != 0 || urb->iso_frame_desc[0].actual_length < 3) return; f = le32_to_cpup(urb->transfer_buffer); if (urb->iso_frame_desc[0].actual_length == 3) f &= 0x00ffffff; else f &= 0x0fffffff; if (f == 0) return; if (unlikely(sender->tenor_fb_quirk)) { /* * Devices based on Tenor 8802 chipsets (TEAC UD-H01 * and others) sometimes change the feedback value * by +/- 0x1.0000. */ if (f < ep->freqn - 0x8000) f += 0xf000; else if (f > ep->freqn + 0x8000) f -= 0xf000; } else if (unlikely(ep->freqshift == INT_MIN)) { /* * The first time we see a feedback value, determine its format * by shifting it left or right until it matches the nominal * frequency value. This assumes that the feedback does not * differ from the nominal value more than +50% or -25%. */ shift = 0; while (f < ep->freqn - ep->freqn / 4) { f <<= 1; shift++; } while (f > ep->freqn + ep->freqn / 2) { f >>= 1; shift--; } ep->freqshift = shift; } else if (ep->freqshift >= 0) f <<= ep->freqshift; else f >>= -ep->freqshift; if (likely(f >= ep->freqn - ep->freqn / 8 && f <= ep->freqmax)) { /* * If the frequency looks valid, set it. * This value is referred to in prepare_playback_urb(). */ spin_lock_irqsave(&ep->lock, flags); ep->freqm = f; spin_unlock_irqrestore(&ep->lock, flags); } else { /* * Out of range; maybe the shift value is wrong. * Reset it so that we autodetect again the next time. */ ep->freqshift = INT_MIN; } }
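/*
 * Illustrative sketch (editor-added, not part of the driver above): a
 * user-space re-implementation of the Q16.16 rate helpers from the top of
 * this file, showing what get_usb_full_speed_rate() and
 * get_usb_high_speed_rate() compute for a 48 kHz stream.  The function
 * names below are hypothetical stand-ins, not the kernel helpers.
 */
#include <stdio.h>

/* fs/1000 in Q16.16, i.e. rate * 65536 / 1000 == ((rate << 13) + 62) / 125 */
static unsigned int example_full_speed_rate(unsigned int rate)
{
        return ((rate << 13) + 62) / 125;
}

/* fs/8000 in Q16.16, i.e. rate * 65536 / 8000 == ((rate << 10) + 62) / 125 */
static unsigned int example_high_speed_rate(unsigned int rate)
{
        return ((rate << 10) + 62) / 125;
}

int main(void)
{
        unsigned int fs = example_full_speed_rate(48000);
        unsigned int hs = example_high_speed_rate(48000);

        /*
         * 48000 Hz -> 48.0 frames per 1 ms frame (0x300000) and
         * 6.0 frames per 125 us microframe (0x60000).
         */
        printf("full speed: 0x%x (%u frames/ms)\n", fs, fs >> 16);
        printf("high speed: 0x%x (%u frames/microframe)\n", hs, hs >> 16);
        return 0;
}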
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/types.h>
#include <linux/netfilter.h>
#include <net/tcp.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_seqadj.h>

int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                      s32 off)
{
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        struct nf_conn_seqadj *seqadj;
        struct nf_ct_seqadj *this_way;

        if (off == 0)
                return 0;

        set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);

        seqadj = nfct_seqadj(ct);
        this_way = &seqadj->seq[dir];
        this_way->offset_before = off;
        this_way->offset_after = off;
        return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_seqadj_init);

int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                     __be32 seq, s32 off)
{
        struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        struct nf_ct_seqadj *this_way;

        if (off == 0)
                return 0;

        if (unlikely(!seqadj)) {
                WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n");
                return 0;
        }

        set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);

        spin_lock_bh(&ct->lock);
        this_way = &seqadj->seq[dir];
        if (this_way->offset_before == this_way->offset_after ||
            before(this_way->correction_pos, ntohl(seq))) {
                this_way->correction_pos = ntohl(seq);
                this_way->offset_before = this_way->offset_after;
                this_way->offset_after += off;
        }
        spin_unlock_bh(&ct->lock);
        return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_seqadj_set);

void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
                          struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                          s32 off)
{
        const struct tcphdr *th;

        if (nf_ct_protonum(ct) != IPPROTO_TCP)
                return;

        th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb));
        nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
}
EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set);

/* Adjust one found SACK option including checksum correction */
static void nf_ct_sack_block_adjust(struct sk_buff *skb,
                                    struct tcphdr *tcph,
                                    unsigned int sackoff,
                                    unsigned int sackend,
                                    struct nf_ct_seqadj *seq)
{
        while (sackoff < sackend) {
                struct tcp_sack_block_wire *sack;
                __be32 new_start_seq, new_end_seq;

                sack = (void *)skb->data + sackoff;
                if (after(ntohl(sack->start_seq) - seq->offset_before,
                          seq->correction_pos))
                        new_start_seq = htonl(ntohl(sack->start_seq) -
                                              seq->offset_after);
                else
                        new_start_seq = htonl(ntohl(sack->start_seq) -
                                              seq->offset_before);

                if (after(ntohl(sack->end_seq) - seq->offset_before,
                          seq->correction_pos))
                        new_end_seq = htonl(ntohl(sack->end_seq) -
                                            seq->offset_after);
                else
                        new_end_seq = htonl(ntohl(sack->end_seq) -
                                            seq->offset_before);

                pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n",
                         ntohl(sack->start_seq), ntohl(new_start_seq),
                         ntohl(sack->end_seq), ntohl(new_end_seq));

                inet_proto_csum_replace4(&tcph->check, skb,
                                         sack->start_seq, new_start_seq, false);
                inet_proto_csum_replace4(&tcph->check, skb,
                                         sack->end_seq, new_end_seq, false);
                sack->start_seq = new_start_seq;
                sack->end_seq = new_end_seq;
                sackoff += sizeof(*sack);
        }
}

/* TCP SACK sequence number adjustment */
static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
                                      unsigned int protoff,
                                      struct nf_conn *ct,
                                      enum ip_conntrack_info ctinfo)
{
        struct tcphdr *tcph = (void *)skb->data + protoff;
        struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
        unsigned int dir, optoff, optend;

        optoff = protoff + sizeof(struct tcphdr);
        optend = protoff + tcph->doff * 4;

        if (skb_ensure_writable(skb, optend))
                return 0;

        tcph = (void *)skb->data + protoff;
        dir = CTINFO2DIR(ctinfo);

        while (optoff < optend) {
                /* Usually: option, length. */
                unsigned char *op = skb->data + optoff;

                switch (op[0]) {
                case TCPOPT_EOL:
                        return 1;
                case TCPOPT_NOP:
                        optoff++;
                        continue;
                default:
                        /* no partial options */
                        if (optoff + 1 == optend ||
                            optoff + op[1] > optend ||
                            op[1] < 2)
                                return 0;
                        if (op[0] == TCPOPT_SACK &&
                            op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
                            ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
                                nf_ct_sack_block_adjust(skb, tcph, optoff + 2,
                                                        optoff+op[1],
                                                        &seqadj->seq[!dir]);
                        optoff += op[1];
                }
        }
        return 1;
}

/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
int nf_ct_seq_adjust(struct sk_buff *skb,
                     struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                     unsigned int protoff)
{
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        struct tcphdr *tcph;
        __be32 newseq, newack;
        s32 seqoff, ackoff;
        struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
        struct nf_ct_seqadj *this_way, *other_way;
        int res = 1;

        this_way = &seqadj->seq[dir];
        other_way = &seqadj->seq[!dir];

        if (skb_ensure_writable(skb, protoff + sizeof(*tcph)))
                return 0;

        tcph = (void *)skb->data + protoff;
        spin_lock_bh(&ct->lock);
        if (after(ntohl(tcph->seq), this_way->correction_pos))
                seqoff = this_way->offset_after;
        else
                seqoff = this_way->offset_before;

        newseq = htonl(ntohl(tcph->seq) + seqoff);
        inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
        pr_debug("Adjusting sequence number from %u->%u\n",
                 ntohl(tcph->seq), ntohl(newseq));
        tcph->seq = newseq;

        if (!tcph->ack)
                goto out;

        if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
                  other_way->correction_pos))
                ackoff = other_way->offset_after;
        else
                ackoff = other_way->offset_before;

        newack = htonl(ntohl(tcph->ack_seq) - ackoff);
        inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack,
                                 false);
        pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n",
                 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
                 ntohl(newack));
        tcph->ack_seq = newack;

        res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo);
out:
        spin_unlock_bh(&ct->lock);

        return res;
}
EXPORT_SYMBOL_GPL(nf_ct_seq_adjust);

s32 nf_ct_seq_offset(const struct nf_conn *ct,
                     enum ip_conntrack_dir dir,
                     u32 seq)
{
        struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
        struct nf_ct_seqadj *this_way;

        if (!seqadj)
                return 0;

        this_way = &seqadj->seq[dir];
        return after(seq, this_way->correction_pos) ?
                this_way->offset_after : this_way->offset_before;
}
EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
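/*
 * Illustrative sketch (editor-added, not part of the file above): a minimal
 * user-space model of the offset selection done by nf_ct_seq_offset().  It
 * assumes a hypothetical adjustment in which 10 bytes were inserted at
 * sequence number 1000, so segments past the correction position take
 * offset_after while earlier (retransmitted) segments keep offset_before.
 * All names below are stand-ins, not kernel API.
 */
#include <stdio.h>
#include <stdint.h>

/* wrap-around safe comparison, same idea as the kernel's after() helper */
static int seq_after(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) > 0;
}

struct example_seqadj {
        uint32_t correction_pos;  /* seq where the size change happened */
        int32_t offset_before;    /* applies to data before the change */
        int32_t offset_after;     /* applies to data past the change */
};

static int32_t example_seq_offset(const struct example_seqadj *s, uint32_t seq)
{
        return seq_after(seq, s->correction_pos) ? s->offset_after
                                                 : s->offset_before;
}

int main(void)
{
        struct example_seqadj s = {
                .correction_pos = 1000,
                .offset_before = 0,
                .offset_after = 10,
        };

        printf("seq  900 -> offset %d\n", (int)example_seq_offset(&s, 900));  /* 0 */
        printf("seq 1500 -> offset %d\n", (int)example_seq_offset(&s, 1500)); /* 10 */
        return 0;
}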
// SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * * This code builds two trees of free clusters extents. * Trees are sorted by start of extent and by length of extent. * NTFS_MAX_WND_EXTENTS defines the maximum number of elements in trees. * In extreme case code reads on-disk bitmap to find free clusters. * */ #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/kernel.h> #include "ntfs.h" #include "ntfs_fs.h" /* * Maximum number of extents in tree. */ #define NTFS_MAX_WND_EXTENTS (32u * 1024u) struct rb_node_key { struct rb_node node; size_t key; }; struct e_node { struct rb_node_key start; /* Tree sorted by start. */ struct rb_node_key count; /* Tree sorted by len. */ }; static int wnd_rescan(struct wnd_bitmap *wnd); static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw); static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits); static struct kmem_cache *ntfs_enode_cachep; int __init ntfs3_init_bitmap(void) { ntfs_enode_cachep = kmem_cache_create("ntfs3_enode_cache", sizeof(struct e_node), 0, SLAB_RECLAIM_ACCOUNT, NULL); return ntfs_enode_cachep ? 0 : -ENOMEM; } void ntfs3_exit_bitmap(void) { kmem_cache_destroy(ntfs_enode_cachep); } /* * wnd_scan * * b_pos + b_len - biggest fragment. * Scan range [wpos wbits) window @buf. * * Return: -1 if not found. */ static size_t wnd_scan(const void *buf, size_t wbit, u32 wpos, u32 wend, size_t to_alloc, size_t *prev_tail, size_t *b_pos, size_t *b_len) { while (wpos < wend) { size_t free_len; u32 free_bits, end; u32 used = find_next_zero_bit_le(buf, wend, wpos); if (used >= wend) { if (*b_len < *prev_tail) { *b_pos = wbit - *prev_tail; *b_len = *prev_tail; } *prev_tail = 0; return -1; } if (used > wpos) { wpos = used; if (*b_len < *prev_tail) { *b_pos = wbit - *prev_tail; *b_len = *prev_tail; } *prev_tail = 0; } /* * Now we have a fragment [wpos, wend) starting with 0. */ end = wpos + to_alloc - *prev_tail; free_bits = find_next_bit_le(buf, min(end, wend), wpos); free_len = *prev_tail + free_bits - wpos; if (*b_len < free_len) { *b_pos = wbit + wpos - *prev_tail; *b_len = free_len; } if (free_len >= to_alloc) return wbit + wpos - *prev_tail; if (free_bits >= wend) { *prev_tail += free_bits - wpos; return -1; } wpos = free_bits + 1; *prev_tail = 0; } return -1; } /* * wnd_close - Frees all resources. */ void wnd_close(struct wnd_bitmap *wnd) { struct rb_node *node, *next; kvfree(wnd->free_bits); wnd->free_bits = NULL; run_close(&wnd->run); node = rb_first(&wnd->start_tree); while (node) { next = rb_next(node); rb_erase(node, &wnd->start_tree); kmem_cache_free(ntfs_enode_cachep, rb_entry(node, struct e_node, start.node)); node = next; } } static struct rb_node *rb_lookup(struct rb_root *root, size_t v) { struct rb_node **p = &root->rb_node; struct rb_node *r = NULL; while (*p) { struct rb_node_key *k; k = rb_entry(*p, struct rb_node_key, node); if (v < k->key) { p = &(*p)->rb_left; } else if (v > k->key) { r = &k->node; p = &(*p)->rb_right; } else { return &k->node; } } return r; } /* * rb_insert_count - Helper function to insert special kind of 'count' tree.
*/ static inline bool rb_insert_count(struct rb_root *root, struct e_node *e) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; size_t e_ckey = e->count.key; size_t e_skey = e->start.key; while (*p) { struct e_node *k = rb_entry(parent = *p, struct e_node, count.node); if (e_ckey > k->count.key) { p = &(*p)->rb_left; } else if (e_ckey < k->count.key) { p = &(*p)->rb_right; } else if (e_skey < k->start.key) { p = &(*p)->rb_left; } else if (e_skey > k->start.key) { p = &(*p)->rb_right; } else { WARN_ON(1); return false; } } rb_link_node(&e->count.node, parent, p); rb_insert_color(&e->count.node, root); return true; } /* * rb_insert_start - Helper function to insert special kind of 'count' tree. */ static inline bool rb_insert_start(struct rb_root *root, struct e_node *e) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; size_t e_skey = e->start.key; while (*p) { struct e_node *k; parent = *p; k = rb_entry(parent, struct e_node, start.node); if (e_skey < k->start.key) { p = &(*p)->rb_left; } else if (e_skey > k->start.key) { p = &(*p)->rb_right; } else { WARN_ON(1); return false; } } rb_link_node(&e->start.node, parent, p); rb_insert_color(&e->start.node, root); return true; } /* * wnd_add_free_ext - Adds a new extent of free space. * @build: 1 when building tree. */ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len, bool build) { struct e_node *e, *e0 = NULL; size_t ib, end_in = bit + len; struct rb_node *n; if (build) { /* Use extent_min to filter too short extents. */ if (wnd->count >= NTFS_MAX_WND_EXTENTS && len <= wnd->extent_min) { wnd->uptodated = -1; return; } } else { /* Try to find extent before 'bit'. */ n = rb_lookup(&wnd->start_tree, bit); if (!n) { n = rb_first(&wnd->start_tree); } else { e = rb_entry(n, struct e_node, start.node); n = rb_next(n); if (e->start.key + e->count.key == bit) { /* Remove left. */ bit = e->start.key; len += e->count.key; rb_erase(&e->start.node, &wnd->start_tree); rb_erase(&e->count.node, &wnd->count_tree); wnd->count -= 1; e0 = e; } } while (n) { size_t next_end; e = rb_entry(n, struct e_node, start.node); next_end = e->start.key + e->count.key; if (e->start.key > end_in) break; /* Remove right. */ n = rb_next(n); len += next_end - end_in; end_in = next_end; rb_erase(&e->start.node, &wnd->start_tree); rb_erase(&e->count.node, &wnd->count_tree); wnd->count -= 1; if (!e0) e0 = e; else kmem_cache_free(ntfs_enode_cachep, e); } if (wnd->uptodated != 1) { /* Check bits before 'bit'. */ ib = wnd->zone_bit == wnd->zone_end || bit < wnd->zone_end ? 0 : wnd->zone_end; while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) { bit -= 1; len += 1; } /* Check bits after 'end_in'. */ ib = wnd->zone_bit == wnd->zone_end || end_in > wnd->zone_bit ? wnd->nbits : wnd->zone_bit; while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) { end_in += 1; len += 1; } } } /* Insert new fragment. */ if (wnd->count >= NTFS_MAX_WND_EXTENTS) { if (e0) kmem_cache_free(ntfs_enode_cachep, e0); wnd->uptodated = -1; /* Compare with smallest fragment. */ n = rb_last(&wnd->count_tree); e = rb_entry(n, struct e_node, count.node); if (len <= e->count.key) goto out; /* Do not insert small fragments. */ if (build) { struct e_node *e2; n = rb_prev(n); e2 = rb_entry(n, struct e_node, count.node); /* Smallest fragment will be 'e2->count.key'. */ wnd->extent_min = e2->count.key; } /* Replace smallest fragment by new one. 
*/ rb_erase(&e->start.node, &wnd->start_tree); rb_erase(&e->count.node, &wnd->count_tree); wnd->count -= 1; } else { e = e0 ? e0 : kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC); if (!e) { wnd->uptodated = -1; goto out; } if (build && len <= wnd->extent_min) wnd->extent_min = len; } e->start.key = bit; e->count.key = len; if (len > wnd->extent_max) wnd->extent_max = len; rb_insert_start(&wnd->start_tree, e); rb_insert_count(&wnd->count_tree, e); wnd->count += 1; out:; } /* * wnd_remove_free_ext - Remove a run from the cached free space. */ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len) { struct rb_node *n, *n3; struct e_node *e, *e3; size_t end_in = bit + len; size_t end3, end, new_key, new_len, max_new_len; /* Try to find extent before 'bit'. */ n = rb_lookup(&wnd->start_tree, bit); if (!n) return; e = rb_entry(n, struct e_node, start.node); end = e->start.key + e->count.key; new_key = new_len = 0; len = e->count.key; /* Range [bit,end_in) must be inside 'e' or outside 'e' and 'n'. */ if (e->start.key > bit) ; else if (end_in <= end) { /* Range [bit,end_in) inside 'e'. */ new_key = end_in; new_len = end - end_in; len = bit - e->start.key; } else if (bit > end) { bool bmax = false; n3 = rb_next(n); while (n3) { e3 = rb_entry(n3, struct e_node, start.node); if (e3->start.key >= end_in) break; if (e3->count.key == wnd->extent_max) bmax = true; end3 = e3->start.key + e3->count.key; if (end3 > end_in) { e3->start.key = end_in; rb_erase(&e3->count.node, &wnd->count_tree); e3->count.key = end3 - end_in; rb_insert_count(&wnd->count_tree, e3); break; } n3 = rb_next(n3); rb_erase(&e3->start.node, &wnd->start_tree); rb_erase(&e3->count.node, &wnd->count_tree); wnd->count -= 1; kmem_cache_free(ntfs_enode_cachep, e3); } if (!bmax) return; n3 = rb_first(&wnd->count_tree); wnd->extent_max = n3 ? rb_entry(n3, struct e_node, count.node)->count.key : 0; return; } if (e->count.key != wnd->extent_max) { ; } else if (rb_prev(&e->count.node)) { ; } else { n3 = rb_next(&e->count.node); max_new_len = max(len, new_len); if (!n3) { wnd->extent_max = max_new_len; } else { e3 = rb_entry(n3, struct e_node, count.node); wnd->extent_max = max(e3->count.key, max_new_len); } } if (!len) { if (new_len) { e->start.key = new_key; rb_erase(&e->count.node, &wnd->count_tree); e->count.key = new_len; rb_insert_count(&wnd->count_tree, e); } else { rb_erase(&e->start.node, &wnd->start_tree); rb_erase(&e->count.node, &wnd->count_tree); wnd->count -= 1; kmem_cache_free(ntfs_enode_cachep, e); } goto out; } rb_erase(&e->count.node, &wnd->count_tree); e->count.key = len; rb_insert_count(&wnd->count_tree, e); if (!new_len) goto out; if (wnd->count >= NTFS_MAX_WND_EXTENTS) { wnd->uptodated = -1; /* Get minimal extent. */ e = rb_entry(rb_last(&wnd->count_tree), struct e_node, count.node); if (e->count.key > new_len) goto out; /* Replace minimum. */ rb_erase(&e->start.node, &wnd->start_tree); rb_erase(&e->count.node, &wnd->count_tree); wnd->count -= 1; } else { e = kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC); if (!e) wnd->uptodated = -1; } if (e) { e->start.key = new_key; e->count.key = new_len; rb_insert_start(&wnd->start_tree, e); rb_insert_count(&wnd->count_tree, e); wnd->count += 1; } out: if (!wnd->count && 1 != wnd->uptodated) wnd_rescan(wnd); } /* * wnd_rescan - Scan all bitmap. Used while initialization. 
*/ static int wnd_rescan(struct wnd_bitmap *wnd) { int err = 0; size_t prev_tail = 0; struct super_block *sb = wnd->sb; struct ntfs_sb_info *sbi = sb->s_fs_info; u64 lbo, len = 0; u32 blocksize = sb->s_blocksize; u8 cluster_bits = sbi->cluster_bits; u32 wbits = 8 * sb->s_blocksize; u32 used, frb; size_t wpos, wbit, iw, vbo; struct buffer_head *bh = NULL; CLST lcn, clen; wnd->uptodated = 0; wnd->extent_max = 0; wnd->extent_min = MINUS_ONE_T; wnd->total_zeroes = 0; vbo = 0; for (iw = 0; iw < wnd->nwnd; iw++) { if (iw + 1 == wnd->nwnd) wbits = wnd->bits_last; if (wnd->inited) { if (!wnd->free_bits[iw]) { /* All ones. */ if (prev_tail) { wnd_add_free_ext(wnd, vbo * 8 - prev_tail, prev_tail, true); prev_tail = 0; } goto next_wnd; } if (wbits == wnd->free_bits[iw]) { /* All zeroes. */ prev_tail += wbits; wnd->total_zeroes += wbits; goto next_wnd; } } if (!len) { u32 off = vbo & sbi->cluster_mask; if (!run_lookup_entry(&wnd->run, vbo >> cluster_bits, &lcn, &clen, NULL)) { err = -ENOENT; goto out; } lbo = ((u64)lcn << cluster_bits) + off; len = ((u64)clen << cluster_bits) - off; } bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits); if (!bh) { err = -EIO; goto out; } used = ntfs_bitmap_weight_le(bh->b_data, wbits); if (used < wbits) { frb = wbits - used; wnd->free_bits[iw] = frb; wnd->total_zeroes += frb; } wpos = 0; wbit = vbo * 8; if (wbit + wbits > wnd->nbits) wbits = wnd->nbits - wbit; do { used = find_next_zero_bit_le(bh->b_data, wbits, wpos); if (used > wpos && prev_tail) { wnd_add_free_ext(wnd, wbit + wpos - prev_tail, prev_tail, true); prev_tail = 0; } wpos = used; if (wpos >= wbits) { /* No free blocks. */ prev_tail = 0; break; } frb = find_next_bit_le(bh->b_data, wbits, wpos); if (frb >= wbits) { /* Keep last free block. */ prev_tail += frb - wpos; break; } wnd_add_free_ext(wnd, wbit + wpos - prev_tail, frb + prev_tail - wpos, true); /* Skip free block and first '1'. */ wpos = frb + 1; /* Reset previous tail. */ prev_tail = 0; } while (wpos < wbits); next_wnd: if (bh) put_bh(bh); bh = NULL; vbo += blocksize; if (len) { len -= blocksize; lbo += blocksize; } } /* Add last block. */ if (prev_tail) wnd_add_free_ext(wnd, wnd->nbits - prev_tail, prev_tail, true); /* * Before init cycle wnd->uptodated was 0. * If any errors or limits occurs while initialization then * wnd->uptodated will be -1. * If 'uptodated' is still 0 then Tree is really updated. */ if (!wnd->uptodated) wnd->uptodated = 1; if (wnd->zone_bit != wnd->zone_end) { size_t zlen = wnd->zone_end - wnd->zone_bit; wnd->zone_end = wnd->zone_bit; wnd_zone_set(wnd, wnd->zone_bit, zlen); } out: return err; } int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits) { int err; u32 blocksize = sb->s_blocksize; u32 wbits = blocksize * 8; init_rwsem(&wnd->rw_lock); wnd->sb = sb; wnd->nbits = nbits; wnd->total_zeroes = nbits; wnd->extent_max = MINUS_ONE_T; wnd->zone_bit = wnd->zone_end = 0; wnd->nwnd = bytes_to_block(sb, ntfs3_bitmap_size(nbits)); wnd->bits_last = nbits & (wbits - 1); if (!wnd->bits_last) wnd->bits_last = wbits; wnd->free_bits = kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO); if (!wnd->free_bits) return -ENOMEM; err = wnd_rescan(wnd); if (err) return err; wnd->inited = true; return 0; } /* * wnd_map - Call sb_bread for requested window. 
*/ static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw) { size_t vbo; CLST lcn, clen; struct super_block *sb = wnd->sb; struct ntfs_sb_info *sbi; struct buffer_head *bh; u64 lbo; sbi = sb->s_fs_info; vbo = (u64)iw << sb->s_blocksize_bits; if (!run_lookup_entry(&wnd->run, vbo >> sbi->cluster_bits, &lcn, &clen, NULL)) { return ERR_PTR(-ENOENT); } lbo = ((u64)lcn << sbi->cluster_bits) + (vbo & sbi->cluster_mask); bh = ntfs_bread(wnd->sb, lbo >> sb->s_blocksize_bits); if (!bh) return ERR_PTR(-EIO); return bh; } /* * wnd_set_free - Mark the bits range from bit to bit + bits as free. */ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) { int err = 0; struct super_block *sb = wnd->sb; size_t bits0 = bits; u32 wbits = 8 * sb->s_blocksize; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbit = bit & (wbits - 1); struct buffer_head *bh; while (iw < wnd->nwnd && bits) { u32 tail, op; if (iw + 1 == wnd->nwnd) wbits = wnd->bits_last; tail = wbits - wbit; op = min_t(u32, tail, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { err = PTR_ERR(bh); break; } lock_buffer(bh); ntfs_bitmap_clear_le(bh->b_data, wbit, op); wnd->free_bits[iw] += op; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); put_bh(bh); wnd->total_zeroes += op; bits -= op; wbit = 0; iw += 1; } wnd_add_free_ext(wnd, bit, bits0, false); return err; } /* * wnd_set_used - Mark the bits range from bit to bit + bits as used. */ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) { int err = 0; struct super_block *sb = wnd->sb; size_t bits0 = bits; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); struct buffer_head *bh; while (iw < wnd->nwnd && bits) { u32 tail, op; if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; tail = wbits - wbit; op = min_t(u32, tail, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { err = PTR_ERR(bh); break; } lock_buffer(bh); ntfs_bitmap_set_le(bh->b_data, wbit, op); wnd->free_bits[iw] -= op; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); put_bh(bh); wnd->total_zeroes -= op; bits -= op; wbit = 0; iw += 1; } if (!RB_EMPTY_ROOT(&wnd->start_tree)) wnd_remove_free_ext(wnd, bit, bits0); return err; } /* * wnd_set_used_safe - Mark the bits range from bit to bit + bits as used. * * Unlikely wnd_set_used/wnd_set_free this function is not full trusted. * It scans every bit in bitmap and marks free bit as used. * @done - how many bits were marked as used. * * NOTE: normally *done should be 0. */ int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits, size_t *done) { size_t i, from = 0, len = 0; int err = 0; *done = 0; for (i = 0; i < bits; i++) { if (wnd_is_free(wnd, bit + i, 1)) { if (!len) from = bit + i; len += 1; } else if (len) { err = wnd_set_used(wnd, from, len); *done += len; len = 0; if (err) break; } } if (len) { /* last fragment. */ err = wnd_set_used(wnd, from, len); *done += len; } return err; } /* * wnd_is_free_hlp * * Return: True if all clusters [bit, bit+bits) are free (bitmap only). 
*/ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) { struct super_block *sb = wnd->sb; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); while (iw < wnd->nwnd && bits) { u32 tail, op; if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; tail = wbits - wbit; op = min_t(u32, tail, bits); if (wbits != wnd->free_bits[iw]) { bool ret; struct buffer_head *bh = wnd_map(wnd, iw); if (IS_ERR(bh)) return false; ret = are_bits_clear(bh->b_data, wbit, op); put_bh(bh); if (!ret) return false; } bits -= op; wbit = 0; iw += 1; } return true; } /* * wnd_is_free * * Return: True if all clusters [bit, bit+bits) are free. */ bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) { bool ret; struct rb_node *n; size_t end; struct e_node *e; if (RB_EMPTY_ROOT(&wnd->start_tree)) goto use_wnd; n = rb_lookup(&wnd->start_tree, bit); if (!n) goto use_wnd; e = rb_entry(n, struct e_node, start.node); end = e->start.key + e->count.key; if (bit < end && bit + bits <= end) return true; use_wnd: ret = wnd_is_free_hlp(wnd, bit, bits); return ret; } /* * wnd_is_used * * Return: True if all clusters [bit, bit+bits) are used. */ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) { bool ret = false; struct super_block *sb = wnd->sb; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); size_t end; struct rb_node *n; struct e_node *e; if (RB_EMPTY_ROOT(&wnd->start_tree)) goto use_wnd; end = bit + bits; n = rb_lookup(&wnd->start_tree, end - 1); if (!n) goto use_wnd; e = rb_entry(n, struct e_node, start.node); if (e->start.key + e->count.key > bit) return false; use_wnd: while (iw < wnd->nwnd && bits) { u32 tail, op; if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; tail = wbits - wbit; op = min_t(u32, tail, bits); if (wnd->free_bits[iw]) { bool ret; struct buffer_head *bh = wnd_map(wnd, iw); if (IS_ERR(bh)) goto out; ret = are_bits_set(bh->b_data, wbit, op); put_bh(bh); if (!ret) goto out; } bits -= op; wbit = 0; iw += 1; } ret = true; out: return ret; } /* * wnd_find - Look for free space. * * - flags - BITMAP_FIND_XXX flags * * Return: 0 if not found. */ size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, size_t flags, size_t *allocated) { struct super_block *sb; u32 wbits, wpos, wzbit, wzend; size_t fnd, max_alloc, b_len, b_pos; size_t iw, prev_tail, nwnd, wbit, ebit, zbit, zend; size_t to_alloc0 = to_alloc; const struct e_node *e; const struct rb_node *pr, *cr; u8 log2_bits; bool fbits_valid; struct buffer_head *bh; /* Fast checking for available free space. */ if (flags & BITMAP_FIND_FULL) { size_t zeroes = wnd_zeroes(wnd); zeroes -= wnd->zone_end - wnd->zone_bit; if (zeroes < to_alloc0) goto no_space; if (to_alloc0 > wnd->extent_max) goto no_space; } else { if (to_alloc > wnd->extent_max) to_alloc = wnd->extent_max; } if (wnd->zone_bit <= hint && hint < wnd->zone_end) hint = wnd->zone_end; max_alloc = wnd->nbits; b_len = b_pos = 0; if (hint >= max_alloc) hint = 0; if (RB_EMPTY_ROOT(&wnd->start_tree)) { if (wnd->uptodated == 1) { /* Extents tree is updated -> No free space. */ goto no_space; } goto scan_bitmap; } e = NULL; if (!hint) goto allocate_biggest; /* Use hint: Enumerate extents by start >= hint. 
*/ pr = NULL; cr = wnd->start_tree.rb_node; for (;;) { e = rb_entry(cr, struct e_node, start.node); if (e->start.key == hint) break; if (e->start.key < hint) { pr = cr; cr = cr->rb_right; if (!cr) break; continue; } cr = cr->rb_left; if (!cr) { e = pr ? rb_entry(pr, struct e_node, start.node) : NULL; break; } } if (!e) goto allocate_biggest; if (e->start.key + e->count.key > hint) { /* We have found extension with 'hint' inside. */ size_t len = e->start.key + e->count.key - hint; if (len >= to_alloc && hint + to_alloc <= max_alloc) { fnd = hint; goto found; } if (!(flags & BITMAP_FIND_FULL)) { if (len > to_alloc) len = to_alloc; if (hint + len <= max_alloc) { fnd = hint; to_alloc = len; goto found; } } } allocate_biggest: /* Allocate from biggest free extent. */ e = rb_entry(rb_first(&wnd->count_tree), struct e_node, count.node); if (e->count.key != wnd->extent_max) wnd->extent_max = e->count.key; if (e->count.key < max_alloc) { if (e->count.key >= to_alloc) { ; } else if (flags & BITMAP_FIND_FULL) { if (e->count.key < to_alloc0) { /* Biggest free block is less then requested. */ goto no_space; } to_alloc = e->count.key; } else if (-1 != wnd->uptodated) { to_alloc = e->count.key; } else { /* Check if we can use more bits. */ size_t op, max_check; struct rb_root start_tree; memcpy(&start_tree, &wnd->start_tree, sizeof(struct rb_root)); memset(&wnd->start_tree, 0, sizeof(struct rb_root)); max_check = e->start.key + to_alloc; if (max_check > max_alloc) max_check = max_alloc; for (op = e->start.key + e->count.key; op < max_check; op++) { if (!wnd_is_free(wnd, op, 1)) break; } memcpy(&wnd->start_tree, &start_tree, sizeof(struct rb_root)); to_alloc = op - e->start.key; } /* Prepare to return. */ fnd = e->start.key; if (e->start.key + to_alloc > max_alloc) to_alloc = max_alloc - e->start.key; goto found; } if (wnd->uptodated == 1) { /* Extents tree is updated -> no free space. */ goto no_space; } b_len = e->count.key; b_pos = e->start.key; scan_bitmap: sb = wnd->sb; log2_bits = sb->s_blocksize_bits + 3; /* At most two ranges [hint, max_alloc) + [0, hint). */ Again: /* TODO: Optimize request for case nbits > wbits. */ iw = hint >> log2_bits; wbits = sb->s_blocksize * 8; wpos = hint & (wbits - 1); prev_tail = 0; fbits_valid = true; if (max_alloc == wnd->nbits) { nwnd = wnd->nwnd; } else { size_t t = max_alloc + wbits - 1; nwnd = likely(t > max_alloc) ? (t >> log2_bits) : wnd->nwnd; } /* Enumerate all windows. */ for (; iw < nwnd; iw++) { wbit = iw << log2_bits; if (!wnd->free_bits[iw]) { if (prev_tail > b_len) { b_pos = wbit - prev_tail; b_len = prev_tail; } /* Skip full used window. */ prev_tail = 0; wpos = 0; continue; } if (unlikely(iw + 1 == nwnd)) { if (max_alloc == wnd->nbits) { wbits = wnd->bits_last; } else { size_t t = max_alloc & (wbits - 1); if (t) { wbits = t; fbits_valid = false; } } } if (wnd->zone_end > wnd->zone_bit) { ebit = wbit + wbits; zbit = max(wnd->zone_bit, wbit); zend = min(wnd->zone_end, ebit); /* Here we have a window [wbit, ebit) and zone [zbit, zend). */ if (zend <= zbit) { /* Zone does not overlap window. */ } else { wzbit = zbit - wbit; wzend = zend - wbit; /* Zone overlaps window. */ if (wnd->free_bits[iw] == wzend - wzbit) { prev_tail = 0; wpos = 0; continue; } /* Scan two ranges window: [wbit, zbit) and [zend, ebit). */ bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { /* TODO: Error */ prev_tail = 0; wpos = 0; continue; } /* Scan range [wbit, zbit). */ if (wpos < wzbit) { /* Scan range [wpos, zbit). 
*/ fnd = wnd_scan(bh->b_data, wbit, wpos, wzbit, to_alloc, &prev_tail, &b_pos, &b_len); if (fnd != MINUS_ONE_T) { put_bh(bh); goto found; } } prev_tail = 0; /* Scan range [zend, ebit). */ if (wzend < wbits) { fnd = wnd_scan(bh->b_data, wbit, max(wzend, wpos), wbits, to_alloc, &prev_tail, &b_pos, &b_len); if (fnd != MINUS_ONE_T) { put_bh(bh); goto found; } } wpos = 0; put_bh(bh); continue; } } /* Current window does not overlap zone. */ if (!wpos && fbits_valid && wnd->free_bits[iw] == wbits) { /* Window is empty. */ if (prev_tail + wbits >= to_alloc) { fnd = wbit + wpos - prev_tail; goto found; } /* Increase 'prev_tail' and process next window. */ prev_tail += wbits; wpos = 0; continue; } /* Read window. */ bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { // TODO: Error. prev_tail = 0; wpos = 0; continue; } /* Scan range [wpos, eBits). */ fnd = wnd_scan(bh->b_data, wbit, wpos, wbits, to_alloc, &prev_tail, &b_pos, &b_len); put_bh(bh); if (fnd != MINUS_ONE_T) goto found; } if (b_len < prev_tail) { /* The last fragment. */ b_len = prev_tail; b_pos = max_alloc - prev_tail; } if (hint) { /* * We have scanned range [hint max_alloc). * Prepare to scan range [0 hint + to_alloc). */ size_t nextmax = hint + to_alloc; if (likely(nextmax >= hint) && nextmax < max_alloc) max_alloc = nextmax; hint = 0; goto Again; } if (!b_len) goto no_space; wnd->extent_max = b_len; if (flags & BITMAP_FIND_FULL) goto no_space; fnd = b_pos; to_alloc = b_len; found: if (flags & BITMAP_FIND_MARK_AS_USED) { /* TODO: Optimize remove extent (pass 'e'?). */ if (wnd_set_used(wnd, fnd, to_alloc)) goto no_space; } else if (wnd->extent_max != MINUS_ONE_T && to_alloc > wnd->extent_max) { wnd->extent_max = to_alloc; } *allocated = fnd; return to_alloc; no_space: return 0; } /* * wnd_extend - Extend bitmap ($MFT bitmap). */ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) { int err; struct super_block *sb = wnd->sb; struct ntfs_sb_info *sbi = sb->s_fs_info; u32 blocksize = sb->s_blocksize; u32 wbits = blocksize * 8; u32 b0, new_last; size_t bits, iw, new_wnd; size_t old_bits = wnd->nbits; u16 *new_free; if (new_bits <= old_bits) return -EINVAL; /* Align to 8 byte boundary. */ new_wnd = bytes_to_block(sb, ntfs3_bitmap_size(new_bits)); new_last = new_bits & (wbits - 1); if (!new_last) new_last = wbits; if (new_wnd != wnd->nwnd) { new_free = kmalloc_array(new_wnd, sizeof(u16), GFP_NOFS); if (!new_free) return -ENOMEM; memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short)); memset(new_free + wnd->nwnd, 0, (new_wnd - wnd->nwnd) * sizeof(short)); kvfree(wnd->free_bits); wnd->free_bits = new_free; } /* Zero bits [old_bits,new_bits). */ bits = new_bits - old_bits; b0 = old_bits & (wbits - 1); for (iw = old_bits >> (sb->s_blocksize_bits + 3); bits; iw += 1) { u32 op; size_t frb; u64 vbo, lbo, bytes; struct buffer_head *bh; if (iw + 1 == new_wnd) wbits = new_last; op = b0 + bits > wbits ? 
wbits - b0 : bits; vbo = (u64)iw * blocksize; err = ntfs_vbo_to_lbo(sbi, &wnd->run, vbo, &lbo, &bytes); if (err) return err; bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits); if (!bh) return -EIO; lock_buffer(bh); ntfs_bitmap_clear_le(bh->b_data, b0, blocksize * 8 - b0); frb = wbits - ntfs_bitmap_weight_le(bh->b_data, wbits); wnd->total_zeroes += frb - wnd->free_bits[iw]; wnd->free_bits[iw] = frb; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); /* err = sync_dirty_buffer(bh); */ b0 = 0; bits -= op; } wnd->nbits = new_bits; wnd->nwnd = new_wnd; wnd->bits_last = new_last; wnd_add_free_ext(wnd, old_bits, new_bits - old_bits, false); return 0; } void wnd_zone_set(struct wnd_bitmap *wnd, size_t lcn, size_t len) { size_t zlen = wnd->zone_end - wnd->zone_bit; if (zlen) wnd_add_free_ext(wnd, wnd->zone_bit, zlen, false); if (!RB_EMPTY_ROOT(&wnd->start_tree) && len) wnd_remove_free_ext(wnd, lcn, len); wnd->zone_bit = lcn; wnd->zone_end = lcn + len; } int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range) { int err = 0; struct super_block *sb = sbi->sb; struct wnd_bitmap *wnd = &sbi->used.bitmap; u32 wbits = 8 * sb->s_blocksize; CLST len = 0, lcn = 0, done = 0; CLST minlen = bytes_to_cluster(sbi, range->minlen); CLST lcn_from = bytes_to_cluster(sbi, range->start); size_t iw = lcn_from >> (sb->s_blocksize_bits + 3); u32 wbit = lcn_from & (wbits - 1); CLST lcn_to; if (!minlen) minlen = 1; if (range->len == (u64)-1) lcn_to = wnd->nbits; else lcn_to = bytes_to_cluster(sbi, range->start + range->len); down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); for (; iw < wnd->nwnd; iw++, wbit = 0) { CLST lcn_wnd = iw * wbits; struct buffer_head *bh; if (lcn_wnd > lcn_to) break; if (!wnd->free_bits[iw]) continue; if (iw + 1 == wnd->nwnd) wbits = wnd->bits_last; if (lcn_wnd + wbits > lcn_to) wbits = lcn_to - lcn_wnd; bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { err = PTR_ERR(bh); break; } for (; wbit < wbits; wbit++) { if (!test_bit_le(wbit, bh->b_data)) { if (!len) lcn = lcn_wnd + wbit; len += 1; continue; } if (len >= minlen) { err = ntfs_discard(sbi, lcn, len); if (err) goto out; done += len; } len = 0; } put_bh(bh); } /* Process the last fragment. 
*/ if (len >= minlen) { err = ntfs_discard(sbi, lcn, len); if (err) goto out; done += len; } out: range->len = (u64)done << sbi->cluster_bits; up_read(&wnd->rw_lock); return err; } #if BITS_PER_LONG == 64 typedef __le64 bitmap_ulong; #define cpu_to_ul(x) cpu_to_le64(x) #define ul_to_cpu(x) le64_to_cpu(x) #else typedef __le32 bitmap_ulong; #define cpu_to_ul(x) cpu_to_le32(x) #define ul_to_cpu(x) le32_to_cpu(x) #endif void ntfs_bitmap_set_le(void *map, unsigned int start, int len) { bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); bitmap_ulong mask_to_set = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start)); while (len - bits_to_set >= 0) { *p |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = cpu_to_ul(~0UL); p++; } if (len) { mask_to_set &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size)); *p |= mask_to_set; } } void ntfs_bitmap_clear_le(void *map, unsigned int start, int len) { bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); bitmap_ulong mask_to_clear = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start)); while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = cpu_to_ul(~0UL); p++; } if (len) { mask_to_clear &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size)); *p &= ~mask_to_clear; } } unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits) { const ulong *bmp = bitmap; unsigned int k, lim = bits / BITS_PER_LONG; unsigned int w = 0; for (k = 0; k < lim; k++) w += hweight_long(bmp[k]); if (bits % BITS_PER_LONG) { w += hweight_long(ul_to_cpu(((bitmap_ulong *)bitmap)[k]) & BITMAP_LAST_WORD_MASK(bits)); } return w; }
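The ntfs_bitmap_set_le()/ntfs_bitmap_clear_le() helpers at the end of this file walk the bitmap one machine word at a time: a first-word mask covers the partial word at the start of the range, full words are written with ~0, and a last-word mask covers the tail. Below is a minimal, compilable userspace sketch of that same masking pattern, assuming plain uint64_t words and hypothetical names; it deliberately omits the little-endian (__le32/__le64) handling the driver needs for the on-disk format.

/* Illustrative userspace sketch only - not part of fs/ntfs3. */
#include <stdint.h>
#include <stdio.h>

static void bitmap_set_range(uint64_t *map, unsigned int start, int len)
{
	uint64_t *p = map + start / 64;
	const unsigned int end = start + len;
	int bits_to_set = 64 - (start % 64);
	uint64_t mask_to_set = ~0ULL << (start % 64);   /* first-word mask */

	while (len - bits_to_set >= 0) {
		*p |= mask_to_set;          /* fill the rest of this word */
		len -= bits_to_set;
		bits_to_set = 64;
		mask_to_set = ~0ULL;        /* subsequent words are full */
		p++;
	}
	if (len) {
		/* last-word mask: only the low (end % 64) bits */
		mask_to_set &= ~0ULL >> (64 - end % 64);
		*p |= mask_to_set;
	}
}

int main(void)
{
	uint64_t map[2] = { 0, 0 };

	/* range crosses a word boundary: 4 bits in word 0, 4 in word 1 */
	bitmap_set_range(map, 60, 8);
	printf("word0=%016llx word1=%016llx\n",
	       (unsigned long long)map[0], (unsigned long long)map[1]);
	/* prints: word0=f000000000000000 word1=000000000000000f */
	return 0;
}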
22 21 22 22 22 22 22 22 22 22 22 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/mac80211/util.c */ #include "ieee802154_i.h" #include "driver-ops.h" /* privid for wpan_phys to determine whether they belong to us or not */ const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid; /** * ieee802154_wake_queue - wake ieee802154 queue * @hw: main hardware object * * Tranceivers usually have either one transmit framebuffer or one framebuffer * for both transmitting and receiving. Hence, the core currently only handles * one frame at a time for each phy, which means we had to stop the queue to * avoid new skb to come during the transmission. The queue then needs to be * woken up after the operation. */ static void ieee802154_wake_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; rcu_read_lock(); clear_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_wake_queue(sdata->dev); } rcu_read_unlock(); } /** * ieee802154_stop_queue - stop ieee802154 queue * @hw: main hardware object * * Tranceivers usually have either one transmit framebuffer or one framebuffer * for both transmitting and receiving. Hence, the core currently only handles * one frame at a time for each phy, which means we need to tell upper layers to * stop giving us new skbs while we are busy with the transmitted one. The queue * must then be stopped before transmitting. 
*/ static void ieee802154_stop_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_stop_queue(sdata->dev); } rcu_read_unlock(); } void ieee802154_hold_queue(struct ieee802154_local *local) { unsigned long flags; spin_lock_irqsave(&local->phy->queue_lock, flags); if (!atomic_fetch_inc(&local->phy->hold_txs)) ieee802154_stop_queue(&local->hw); spin_unlock_irqrestore(&local->phy->queue_lock, flags); } void ieee802154_release_queue(struct ieee802154_local *local) { unsigned long flags; spin_lock_irqsave(&local->phy->queue_lock, flags); if (atomic_dec_and_test(&local->phy->hold_txs)) ieee802154_wake_queue(&local->hw); spin_unlock_irqrestore(&local->phy->queue_lock, flags); } void ieee802154_disable_queue(struct ieee802154_local *local) { struct ieee802154_sub_if_data *sdata; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_tx_disable(sdata->dev); } rcu_read_unlock(); } enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer) { struct ieee802154_local *local = container_of(timer, struct ieee802154_local, ifs_timer); ieee802154_release_queue(local); return HRTIMER_NORESTART; } void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, bool ifs_handling) { struct ieee802154_local *local = hw_to_local(hw); local->tx_result = IEEE802154_SUCCESS; if (ifs_handling) { u8 max_sifs_size; /* If transceiver sets CRC on his own we need to use lifs * threshold len above 16 otherwise 18, because it's not * part of skb->len. */ if (hw->flags & IEEE802154_HW_TX_OMIT_CKSUM) max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE - IEEE802154_FCS_LEN; else max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE; if (skb->len > max_sifs_size) hrtimer_start(&local->ifs_timer, hw->phy->lifs_period * NSEC_PER_USEC, HRTIMER_MODE_REL); else hrtimer_start(&local->ifs_timer, hw->phy->sifs_period * NSEC_PER_USEC, HRTIMER_MODE_REL); } else { ieee802154_release_queue(local); } dev_consume_skb_any(skb); if (atomic_dec_and_test(&hw->phy->ongoing_txs)) wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_complete); void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, int reason) { struct ieee802154_local *local = hw_to_local(hw); local->tx_result = reason; ieee802154_release_queue(local); dev_kfree_skb_any(skb); if (atomic_dec_and_test(&hw->phy->ongoing_txs)) wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_error); void ieee802154_xmit_hw_error(struct ieee802154_hw *hw, struct sk_buff *skb) { ieee802154_xmit_error(hw, skb, IEEE802154_SYSTEM_ERROR); } EXPORT_SYMBOL(ieee802154_xmit_hw_error); void ieee802154_stop_device(struct ieee802154_local *local) { flush_workqueue(local->workqueue); hrtimer_cancel(&local->ifs_timer); drv_stop(local); }
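ieee802154_hold_queue()/ieee802154_release_queue() above implement a counted hold: the queue is stopped only on the 0 -> 1 transition of hold_txs and woken again only when the last holder releases it. The following userspace sketch shows just that counting pattern with C11 atomics; all names are hypothetical, and the spinlock (phy->queue_lock) taken by the real code is omitted.

/* Illustrative userspace sketch only - not mac802154 code. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int hold_txs;

static void stop_queue(void) { puts("queue stopped"); }
static void wake_queue(void) { puts("queue woken"); }

static void hold_queue(void)
{
	/* fetch_add returns the old value: only 0 -> 1 stops the queue */
	if (atomic_fetch_add(&hold_txs, 1) == 0)
		stop_queue();
}

static void release_queue(void)
{
	/* only the final release (1 -> 0) wakes the queue again */
	if (atomic_fetch_sub(&hold_txs, 1) == 1)
		wake_queue();
}

int main(void)
{
	hold_queue();     /* stops the queue */
	hold_queue();     /* nested hold: no effect */
	release_queue();  /* still held: no effect */
	release_queue();  /* last release: wakes the queue */
	return 0;
}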
32 33 33 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019 ARM Ltd. * * Generic implementation of update_vsyscall and update_vsyscall_tz. * * Based on the x86 specific implementation. */ #include <linux/hrtimer.h> #include <linux/timekeeper_internal.h> #include <vdso/datapage.h> #include <vdso/helpers.h> #include <vdso/vsyscall.h> #include "timekeeping_internal.h" static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { struct vdso_timestamp *vdso_ts; u64 nsec, sec; vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; #endif vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; #endif vdata[CS_RAW].mask = tk->tkr_raw.mask; vdata[CS_RAW].mult = tk->tkr_raw.mult; vdata[CS_RAW].shift = tk->tkr_raw.shift; /* CLOCK_MONOTONIC */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* Copy MONOTONIC time for BOOTTIME */ sec = vdso_ts->sec; /* Add the boot offset */ sec += tk->monotonic_to_boot.tv_sec; nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { struct vdso_data *vdata = __arch_get_k_vdso_data(); struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; /* copy vsyscall data */ vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; vdata[CS_HRES_COARSE].clock_mode = clock_mode; vdata[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> 
tk->tkr_mono.shift; /* CLOCK_MONOTONIC_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). * Note: No need to have a second copy. */ WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata); vdso_write_end(vdata); __arch_sync_vdso_data(vdata); } void update_vsyscall_tz(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; __arch_sync_vdso_data(vdata); } /** * vdso_update_begin - Start of a VDSO update section * * Allows architecture code to safely update the architecture specific VDSO * data. Disables interrupts, acquires timekeeper lock to serialize against * concurrent updates from timekeeping and invalidates the VDSO data * sequence counter to prevent concurrent readers from accessing * inconsistent data. * * Returns: Saved interrupt flags which need to be handed in to * vdso_update_end(). */ unsigned long vdso_update_begin(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); return flags; } /** * vdso_update_end - End of a VDSO update section * @flags: Interrupt flags as returned from vdso_update_begin() * * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data * synchronization if the architecture requires it, drops timekeeper lock * and restores interrupt flags. */ void vdso_update_end(unsigned long flags) { struct vdso_data *vdata = __arch_get_k_vdso_data(); vdso_write_end(vdata); __arch_sync_vdso_data(vdata); timekeeper_unlock_irqrestore(flags); }
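update_vdso_data() above keeps nanoseconds left-shifted by the clocksource shift and carries whole seconds out of that field by repeatedly subtracting NSEC_PER_SEC << shift; the VDSO reader shifts the remainder back down when it assembles the final timestamp. A standalone sketch of that normalisation, using made-up numbers:

/* Illustrative userspace sketch only - not kernel/time code. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint32_t shift = 8;                        /* example clocksource shift */
	uint64_t sec = 1000;                       /* base seconds */
	/* 2.5 seconds worth of nanoseconds, kept pre-shifted */
	uint64_t nsec = (5 * NSEC_PER_SEC / 2) << shift;

	/* carry whole seconds out of the shifted nanosecond field */
	while (nsec >= (NSEC_PER_SEC << shift)) {
		nsec -= NSEC_PER_SEC << shift;
		sec++;
	}

	printf("sec=%llu nsec=%llu\n",
	       (unsigned long long)sec,
	       (unsigned long long)(nsec >> shift));
	/* prints: sec=1002 nsec=500000000 */
	return 0;
}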
1 6 1 6 6 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 // SPDX-License-Identifier: GPL-2.0-only #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <asm/string.h> #include <linux/kmod.h> #include <linux/sysctl.h> #include <net/ip_vs.h> /* IPVS pe list */ static LIST_HEAD(ip_vs_pe); /* semaphore for IPVS PEs. */ static DEFINE_MUTEX(ip_vs_pe_mutex); /* Get pe in the pe list by name */ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) { struct ip_vs_pe *pe; IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, pe_name); rcu_read_lock(); list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) { /* Test and get the modules atomically */ if (pe->module && !try_module_get(pe->module)) { /* This pe is just deleted */ continue; } if (strcmp(pe_name, pe->name)==0) { /* HIT */ rcu_read_unlock(); return pe; } module_put(pe->module); } rcu_read_unlock(); return NULL; } /* Lookup pe and try to load it if it doesn't exist */ struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) { struct ip_vs_pe *pe; /* Search for the pe by name */ pe = __ip_vs_pe_getbyname(name); /* If pe not found, load the module and search again */ if (!pe) { request_module("ip_vs_pe_%s", name); pe = __ip_vs_pe_getbyname(name); } return pe; } /* Register a pe in the pe list */ int register_ip_vs_pe(struct ip_vs_pe *pe) { struct ip_vs_pe *tmp; /* increase the module use count */ if (!ip_vs_use_count_inc()) return -ENOENT; mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist * in the pe list. */ list_for_each_entry(tmp, &ip_vs_pe, n_list) { if (strcmp(tmp->name, pe->name) == 0) { mutex_unlock(&ip_vs_pe_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] pe already existed " "in the system\n", __func__, pe->name); return -EINVAL; } } /* Add it into the d-linked pe list */ list_add_rcu(&pe->n_list, &ip_vs_pe); mutex_unlock(&ip_vs_pe_mutex); pr_info("[%s] pe registered.\n", pe->name); return 0; } EXPORT_SYMBOL_GPL(register_ip_vs_pe); /* Unregister a pe from the pe list */ int unregister_ip_vs_pe(struct ip_vs_pe *pe) { mutex_lock(&ip_vs_pe_mutex); /* Remove it from the d-linked pe list */ list_del_rcu(&pe->n_list); mutex_unlock(&ip_vs_pe_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); pr_info("[%s] pe unregistered.\n", pe->name); return 0; } EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
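ip_vs_pe_getbyname() above looks the persistence engine up by name and, on a miss, asks for the matching module via request_module() before retrying the lookup once. The sketch below shows only that lookup/load/retry shape in userspace; the registry, loader and names are all made up, and the RCU locking and module refcounting of the real code are omitted.

/* Illustrative userspace sketch only - not IPVS code. */
#include <stdio.h>
#include <string.h>

struct pe { const char *name; };

static struct pe registry[8];
static int nr_pes;

static struct pe *pe_lookup(const char *name)
{
	for (int i = 0; i < nr_pes; i++)
		if (strcmp(registry[i].name, name) == 0)
			return &registry[i];
	return NULL;
}

/* stand-in for request_module(): loading "sip" registers it */
static void load_pe_module(const char *name)
{
	static struct pe sip = { "sip" };

	if (strcmp(name, "sip") == 0 && nr_pes < 8)
		registry[nr_pes++] = sip;
}

static struct pe *pe_get(const char *name)
{
	struct pe *pe = pe_lookup(name);

	if (!pe) {                 /* miss: try to load, then retry once */
		load_pe_module(name);
		pe = pe_lookup(name);
	}
	return pe;
}

int main(void)
{
	printf("sip -> %s\n", pe_get("sip") ? "found" : "missing");
	printf("foo -> %s\n", pe_get("foo") ? "found" : "missing");
	return 0;
}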
54 3 51 23 22 21 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * Single-block cipher operations. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <crypto/internal/cipher.h> #include <linux/kernel.h> #include <linux/crypto.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" static int setkey_unaligned(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen) { struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); int ret; u8 *buffer, *alignbuffer; unsigned long absize; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cia->cia_setkey(crypto_cipher_tfm(tfm), alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_cipher_setkey(struct crypto_cipher *tfm, const u8 *key, unsigned int keylen) { struct cipher_alg *cia = crypto_cipher_alg(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) return setkey_unaligned(tfm, key, keylen); return cia->cia_setkey(crypto_cipher_tfm(tfm), key, keylen); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_setkey, CRYPTO_INTERNAL); static inline void cipher_crypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src, bool enc) { unsigned long alignmask = crypto_cipher_alignmask(tfm); struct cipher_alg *cia = crypto_cipher_alg(tfm); void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = enc ? cia->cia_encrypt : cia->cia_decrypt; if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) { unsigned int bs = crypto_cipher_blocksize(tfm); u8 buffer[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *tmp = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(tmp, src, bs); fn(crypto_cipher_tfm(tfm), tmp, tmp); memcpy(dst, tmp, bs); } else { fn(crypto_cipher_tfm(tfm), dst, src); } } void crypto_cipher_encrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src) { cipher_crypt_one(tfm, dst, src, true); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_encrypt_one, CRYPTO_INTERNAL); void crypto_cipher_decrypt_one(struct crypto_cipher *tfm, u8 *dst, const u8 *src) { cipher_crypt_one(tfm, dst, src, false); } EXPORT_SYMBOL_NS_GPL(crypto_cipher_decrypt_one, CRYPTO_INTERNAL); struct crypto_cipher *crypto_clone_cipher(struct crypto_cipher *cipher) { struct crypto_tfm *tfm = crypto_cipher_tfm(cipher); struct crypto_alg *alg = tfm->__crt_alg; struct crypto_cipher *ncipher; struct crypto_tfm *ntfm; if (alg->cra_init) return ERR_PTR(-ENOSYS); if (unlikely(!crypto_mod_get(alg))) return ERR_PTR(-ESTALE); ntfm = __crypto_alloc_tfmgfp(alg, CRYPTO_ALG_TYPE_CIPHER, CRYPTO_ALG_TYPE_MASK, GFP_ATOMIC); if (IS_ERR(ntfm)) { crypto_mod_put(alg); return ERR_CAST(ntfm); } ntfm->crt_flags = tfm->crt_flags; ncipher = __crypto_cipher_cast(ntfm); return ncipher; } EXPORT_SYMBOL_GPL(crypto_clone_cipher);
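cipher_crypt_one() above handles misaligned buffers by bouncing the block through a stack buffer aligned to alignmask + 1 and running the cipher on the aligned copy. Here is a compilable userspace sketch of the same bounce-buffer idea; the XOR "cipher", the buffer sizes and all names are hypothetical stand-ins, not the crypto API.

/* Illustrative userspace sketch only - not crypto API code. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~(uintptr_t)((a) - 1))

static void xor_block(uint8_t *dst, const uint8_t *src, size_t bs)
{
	for (size_t i = 0; i < bs; i++)
		dst[i] = src[i] ^ 0xAA;        /* stand-in "cipher" */
}

static void crypt_one(uint8_t *dst, const uint8_t *src,
		      size_t bs, unsigned long alignmask)
{
	if (((uintptr_t)dst | (uintptr_t)src) & alignmask) {
		uint8_t buffer[64 + 15];       /* block + worst-case slack */
		uint8_t *tmp = (uint8_t *)ALIGN_UP((uintptr_t)buffer,
						   alignmask + 1);

		memcpy(tmp, src, bs);          /* bounce in */
		xor_block(tmp, tmp, bs);       /* work on the aligned copy */
		memcpy(dst, tmp, bs);          /* bounce out */
	} else {
		xor_block(dst, src, bs);       /* already aligned */
	}
}

int main(void)
{
	_Alignas(16) uint8_t in[17], out[17];

	memset(in, 0x11, sizeof(in));
	/* in + 1 / out + 1 are misaligned for a 16-byte requirement */
	crypt_one(out + 1, in + 1, 16, 15);
	printf("first output byte: 0x%02x\n", (unsigned)out[1]); /* 0xbb */
	return 0;
}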
/* * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
* */ #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/rbtree.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/completion.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/sysctl.h> #include <rdma/iw_cm.h> #include <rdma/ib_addr.h> #include <rdma/iw_portmap.h> #include <rdma/rdma_netlink.h> #include "iwcm.h" MODULE_AUTHOR("Tom Tucker"); MODULE_DESCRIPTION("iWARP CM"); MODULE_LICENSE("Dual BSD/GPL"); static const char * const iwcm_rej_reason_strs[] = { [ECONNRESET] = "reset by remote host", [ECONNREFUSED] = "refused by remote application", [ETIMEDOUT] = "setup timeout", }; const char *__attribute_const__ iwcm_reject_msg(int reason) { size_t index; /* iWARP uses negative errnos */ index = -reason; if (index < ARRAY_SIZE(iwcm_rej_reason_strs) && iwcm_rej_reason_strs[index]) return iwcm_rej_reason_strs[index]; else return "unrecognized reason"; } EXPORT_SYMBOL(iwcm_reject_msg); static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = { [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}, [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb} }; static struct workqueue_struct *iwcm_wq; struct iwcm_work { struct work_struct work; struct iwcm_id_private *cm_id; struct list_head list; struct iw_cm_event event; struct list_head free_list; }; static unsigned int default_backlog = 256; static struct ctl_table_header *iwcm_ctl_table_hdr; static struct ctl_table iwcm_ctl_table[] = { { .procname = "default_backlog", .data = &default_backlog, .maxlen = sizeof(default_backlog), .mode = 0644, .proc_handler = proc_dointvec, }, }; /* * The following services provide a mechanism for pre-allocating iwcm_work * elements. The design pre-allocates them based on the cm_id type: * LISTENING IDS: Get enough elements preallocated to handle the * listen backlog. * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE * * Allocating them in connect and listen avoids having to deal * with allocation failures on the event upcall from the provider (which * is called in the interrupt context). * * One exception is when creating the cm_id for incoming connection requests. * There are two cases: * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If * the backlog is exceeded, then no more connection request events will * be processed. cm_event_handler() returns -ENOMEM in this case. Its up * to the provider to reject the connection request. * 2) in the connection request workqueue handler, cm_conn_req_handler(). * If work elements cannot be allocated for the new connect request cm_id, * then IWCM will call the provider reject method. This is ok since * cm_conn_req_handler() runs in the workqueue thread context. 
*/ static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) { struct iwcm_work *work; if (list_empty(&cm_id_priv->work_free_list)) return NULL; work = list_first_entry(&cm_id_priv->work_free_list, struct iwcm_work, free_list); list_del_init(&work->free_list); return work; } static void put_work(struct iwcm_work *work) { list_add(&work->free_list, &work->cm_id->work_free_list); } static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv) { struct list_head *e, *tmp; list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) { list_del(e); kfree(list_entry(e, struct iwcm_work, free_list)); } } static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) { struct iwcm_work *work; BUG_ON(!list_empty(&cm_id_priv->work_free_list)); while (count--) { work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL); if (!work) { dealloc_work_entries(cm_id_priv); return -ENOMEM; } work->cm_id = cm_id_priv; INIT_LIST_HEAD(&work->list); put_work(work); } return 0; } /* * Save private data from incoming connection requests to * iw_cm_event, so the low level driver doesn't have to. Adjust * the event ptr to point to the local copy. */ static int copy_private_data(struct iw_cm_event *event) { void *p; p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC); if (!p) return -ENOMEM; event->private_data = p; return 0; } static void free_cm_id(struct iwcm_id_private *cm_id_priv) { dealloc_work_entries(cm_id_priv); kfree(cm_id_priv); } /* * Release a reference on cm_id. If the last reference is being * released, free the cm_id and return 'true'. */ static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { if (refcount_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); return true; } return false; } static void add_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); refcount_inc(&cm_id_priv->refcount); } static void rem_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); (void)iwcm_deref_id(cm_id_priv); } static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); struct iw_cm_id *iw_create_cm_id(struct ib_device *device, iw_cm_handler cm_handler, void *context) { struct iwcm_id_private *cm_id_priv; cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL); if (!cm_id_priv) return ERR_PTR(-ENOMEM); cm_id_priv->state = IW_CM_STATE_IDLE; cm_id_priv->id.device = device; cm_id_priv->id.cm_handler = cm_handler; cm_id_priv->id.context = context; cm_id_priv->id.event_handler = cm_event_handler; cm_id_priv->id.add_ref = add_ref; cm_id_priv->id.rem_ref = rem_ref; spin_lock_init(&cm_id_priv->lock); refcount_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); init_completion(&cm_id_priv->destroy_comp); INIT_LIST_HEAD(&cm_id_priv->work_list); INIT_LIST_HEAD(&cm_id_priv->work_free_list); return &cm_id_priv->id; } EXPORT_SYMBOL(iw_create_cm_id); static int iwcm_modify_qp_err(struct ib_qp *qp) { struct ib_qp_attr qp_attr; if (!qp) return -EINVAL; qp_attr.qp_state = IB_QPS_ERR; return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); } /* * This is really the RDMAC CLOSING state. It is most similar to the * IB SQD QP state. 
*/ static int iwcm_modify_qp_sqd(struct ib_qp *qp) { struct ib_qp_attr qp_attr; BUG_ON(qp == NULL); qp_attr.qp_state = IB_QPS_SQD; return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); } /* * CM_ID <-- CLOSING * * Block if a passive or active connection is currently being processed. Then * process the event as follows: * - If we are ESTABLISHED, move to CLOSING and modify the QP state * based on the abrupt flag * - If the connection is already in the CLOSING or IDLE state, the peer is * disconnecting concurrently with us and we've already seen the * DISCONNECT event -- ignore the request and return 0 * - Disconnect on a listening endpoint returns -EINVAL */ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret = 0; struct ib_qp *qp = NULL; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* Wait if we're currently in a connect or accept downcall */ wait_event(cm_id_priv->connect_wait, !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_ESTABLISHED: cm_id_priv->state = IW_CM_STATE_CLOSING; /* QP could be <nul> for user-mode client */ if (cm_id_priv->qp) qp = cm_id_priv->qp; else ret = -EINVAL; break; case IW_CM_STATE_LISTEN: ret = -EINVAL; break; case IW_CM_STATE_CLOSING: /* remote peer closed first */ case IW_CM_STATE_IDLE: /* accept or connect returned !0 */ break; case IW_CM_STATE_CONN_RECV: /* * App called disconnect before/without calling accept after * connect_request event delivered. */ break; case IW_CM_STATE_CONN_SENT: /* Can only get here if wait above fails */ default: BUG(); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) { if (abrupt) ret = iwcm_modify_qp_err(qp); else ret = iwcm_modify_qp_sqd(qp); /* * If both sides are disconnecting the QP could * already be in ERR or SQD states */ ret = 0; } return ret; } EXPORT_SYMBOL(iw_cm_disconnect); /* * CM_ID <-- DESTROYING * * Clean up all resources associated with the connection and release * the initial reference taken by iw_create_cm_id. * * Returns true if and only if the last cm_id_priv reference has been dropped. */ static bool destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; struct ib_qp *qp; unsigned long flags; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* * Wait if we're currently in a connect or accept downcall. A * listening endpoint should never block here. */ wait_event(cm_id_priv->connect_wait, !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); /* * Since we're deleting the cm_id, drop any events that * might arrive before the last dereference. 
*/ set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); qp = cm_id_priv->qp; cm_id_priv->qp = NULL; switch (cm_id_priv->state) { case IW_CM_STATE_LISTEN: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); /* destroy the listening endpoint */ cm_id->device->ops.iw_destroy_listen(cm_id); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_ESTABLISHED: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); /* Abrupt close of the connection */ (void)iwcm_modify_qp_err(qp); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_IDLE: case IW_CM_STATE_CLOSING: cm_id_priv->state = IW_CM_STATE_DESTROYING; break; case IW_CM_STATE_CONN_RECV: /* * App called destroy before/without calling accept after * receiving connection request event notification or * returned non zero from the event callback function. * In either case, must tell the provider to reject. */ cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_id->device->ops.iw_reject(cm_id, NULL, 0); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_DESTROYING: default: BUG(); break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) cm_id_priv->id.device->ops.iw_rem_ref(qp); if (cm_id->mapped) { iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr); iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM); } return iwcm_deref_id(cm_id_priv); } /* * This function is only called by the application thread and cannot * be called by the event thread. The function will wait for all * references to be released on the cm_id and then kfree the cm_id * object. */ void iw_destroy_cm_id(struct iw_cm_id *cm_id) { if (!destroy_cm_id(cm_id)) flush_workqueue(iwcm_wq); } EXPORT_SYMBOL(iw_destroy_cm_id); /** * iw_cm_check_wildcard - If IP address is 0 then use original * @pm_addr: sockaddr containing the ip to check for wildcard * @cm_addr: sockaddr containing the actual IP address * @cm_outaddr: sockaddr to set IP addr which leaving port * * Checks the pm_addr for wildcard and then sets cm_outaddr's * IP to the actual (cm_addr). */ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr, struct sockaddr_storage *cm_addr, struct sockaddr_storage *cm_outaddr) { if (pm_addr->ss_family == AF_INET) { struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr; if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) { struct sockaddr_in *cm4_addr = (struct sockaddr_in *)cm_addr; struct sockaddr_in *cm4_outaddr = (struct sockaddr_in *)cm_outaddr; cm4_outaddr->sin_addr = cm4_addr->sin_addr; } } else { struct sockaddr_in6 *pm6_addr = (struct sockaddr_in6 *)pm_addr; if (ipv6_addr_type(&pm6_addr->sin6_addr) == IPV6_ADDR_ANY) { struct sockaddr_in6 *cm6_addr = (struct sockaddr_in6 *)cm_addr; struct sockaddr_in6 *cm6_outaddr = (struct sockaddr_in6 *)cm_outaddr; cm6_outaddr->sin6_addr = cm6_addr->sin6_addr; } } } /** * iw_cm_map - Use portmapper to map the ports * @cm_id: connection manager pointer * @active: Indicates the active side when true * returns nonzero for error only if iwpm_create_mapinfo() fails * * Tries to add a mapping for a port using the Portmapper. If * successful in mapping the IP/Port it will check the remote * mapped IP address for a wildcard IP address and replace the * zero IP address with the remote_addr. 
*/ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) { const char *devname = dev_name(&cm_id->device->dev); const char *ifname = cm_id->device->iw_ifname; struct iwpm_dev_data pm_reg_msg = {}; struct iwpm_sa_data pm_msg; int status; if (strlen(devname) >= sizeof(pm_reg_msg.dev_name) || strlen(ifname) >= sizeof(pm_reg_msg.if_name)) return -EINVAL; cm_id->m_local_addr = cm_id->local_addr; cm_id->m_remote_addr = cm_id->remote_addr; strcpy(pm_reg_msg.dev_name, devname); strcpy(pm_reg_msg.if_name, ifname); if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) || !iwpm_valid_pid()) return 0; cm_id->mapped = true; pm_msg.loc_addr = cm_id->local_addr; pm_msg.rem_addr = cm_id->remote_addr; pm_msg.flags = (cm_id->device->iw_driver_flags & IW_F_NO_PORT_MAP) ? IWPM_FLAGS_NO_PORT_MAP : 0; if (active) status = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_IWCM); else status = iwpm_add_mapping(&pm_msg, RDMA_NL_IWCM); if (!status) { cm_id->m_local_addr = pm_msg.mapped_loc_addr; if (active) { cm_id->m_remote_addr = pm_msg.mapped_rem_addr; iw_cm_check_wildcard(&pm_msg.mapped_rem_addr, &cm_id->remote_addr, &cm_id->m_remote_addr); } } return iwpm_create_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr, RDMA_NL_IWCM, pm_msg.flags); } /* * CM_ID <-- LISTEN * * Start listening for connect requests. Generates one CONNECT_REQUEST * event for each inbound connect request. */ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); if (!backlog) backlog = default_backlog; ret = alloc_work_entries(cm_id_priv, backlog); if (ret) return ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: cm_id_priv->state = IW_CM_STATE_LISTEN; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = iw_cm_map(cm_id, false); if (!ret) ret = cm_id->device->ops.iw_create_listen(cm_id, backlog); if (ret) cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); break; default: ret = -EINVAL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(iw_cm_listen); /* * CM_ID <-- IDLE * * Rejects an inbound connection request. No events are generated. */ int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id_priv->state = IW_CM_STATE_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->ops.iw_reject(cm_id, private_data, private_data_len); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return ret; } EXPORT_SYMBOL(iw_cm_reject); /* * CM_ID <-- ESTABLISHED * * Accepts an inbound connection request and generates an ESTABLISHED * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block * until the ESTABLISHED event is received from the provider. 
*/ int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) { struct iwcm_id_private *cm_id_priv; struct ib_qp *qp; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } /* Get the ib_qp given the QPN */ qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->ops.iw_add_ref(qp); cm_id_priv->qp = qp; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->ops.iw_accept(cm_id, iw_param); if (ret) { /* An error on accept precludes provider events */ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); qp = cm_id_priv->qp; cm_id_priv->qp = NULL; spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) cm_id->device->ops.iw_rem_ref(qp); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); } return ret; } EXPORT_SYMBOL(iw_cm_accept); /* * Active Side: CM_ID <-- CONN_SENT * * If successful, results in the generation of a CONNECT_REPLY * event. iw_cm_disconnect and iw_cm_destroy will block until the * CONNECT_REPLY event is received from the provider. */ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) { struct iwcm_id_private *cm_id_priv; int ret; unsigned long flags; struct ib_qp *qp = NULL; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); ret = alloc_work_entries(cm_id_priv, 4); if (ret) return ret; set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_IDLE) { ret = -EINVAL; goto err; } /* Get the ib_qp given the QPN */ qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn); if (!qp) { ret = -EINVAL; goto err; } cm_id->device->ops.iw_add_ref(qp); cm_id_priv->qp = qp; cm_id_priv->state = IW_CM_STATE_CONN_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = iw_cm_map(cm_id, true); if (!ret) ret = cm_id->device->ops.iw_connect(cm_id, iw_param); if (!ret) return 0; /* success */ spin_lock_irqsave(&cm_id_priv->lock, flags); qp = cm_id_priv->qp; cm_id_priv->qp = NULL; cm_id_priv->state = IW_CM_STATE_IDLE; err: spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) cm_id->device->ops.iw_rem_ref(qp); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return ret; } EXPORT_SYMBOL(iw_cm_connect); /* * Passive Side: new CM_ID <-- CONN_RECV * * Handles an inbound connect request. The function creates a new * iw_cm_id to represent the new connection and inherits the client * callback function and other attributes from the listening parent. * * The work item contains a pointer to the listen_cm_id and the event. The * listen_cm_id contains the client cm_handler, context and * device. These are copied when the device is cloned. The event * contains the new four tuple. * * An error on the child should not affect the parent, so this * function does not return a value. 
*/ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; struct iw_cm_id *cm_id; struct iwcm_id_private *cm_id_priv; int ret; /* * The provider should never generate a connection request * event with a bad status. */ BUG_ON(iw_event->status); cm_id = iw_create_cm_id(listen_id_priv->id.device, listen_id_priv->id.cm_handler, listen_id_priv->id.context); /* If the cm_id could not be created, ignore the request */ if (IS_ERR(cm_id)) goto out; cm_id->provider_data = iw_event->provider_data; cm_id->m_local_addr = iw_event->local_addr; cm_id->m_remote_addr = iw_event->remote_addr; cm_id->local_addr = listen_id_priv->id.local_addr; ret = iwpm_get_remote_info(&listen_id_priv->id.m_local_addr, &iw_event->remote_addr, &cm_id->remote_addr, RDMA_NL_IWCM); if (ret) { cm_id->remote_addr = iw_event->remote_addr; } else { iw_cm_check_wildcard(&listen_id_priv->id.m_local_addr, &iw_event->local_addr, &cm_id->local_addr); iw_event->local_addr = cm_id->local_addr; iw_event->remote_addr = cm_id->remote_addr; } cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); cm_id_priv->state = IW_CM_STATE_CONN_RECV; /* * We could be destroying the listening id. If so, ignore this * upcall. */ spin_lock_irqsave(&listen_id_priv->lock, flags); if (listen_id_priv->state != IW_CM_STATE_LISTEN) { spin_unlock_irqrestore(&listen_id_priv->lock, flags); iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); goto out; } spin_unlock_irqrestore(&listen_id_priv->lock, flags); ret = alloc_work_entries(cm_id_priv, 3); if (ret) { iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); goto out; } /* Call the client CM handler */ ret = cm_id->cm_handler(cm_id, iw_event); if (ret) { iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); } out: if (iw_event->private_data_len) kfree(iw_event->private_data); } /* * Passive Side: CM_ID <-- ESTABLISHED * * The provider generated an ESTABLISHED event which means that * the MPA negotiation has completed successfully and we are now in MPA * FPDU mode. * * This event can only be received in the CONN_RECV state. If the * remote peer closed, the ESTABLISHED event would be received followed * by the CLOSE event. If the app closes, it will block until we wake * it up after processing this event. */ static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); /* * We clear the CONNECT_WAIT bit here to allow the callback * function to call iw_cm_disconnect. Calling iw_destroy_cm_id * from a callback handler is not allowed. */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_ESTABLISHED; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); wake_up_all(&cm_id_priv->connect_wait); return ret; } /* * Active Side: CM_ID <-- ESTABLISHED * * The app has called connect and is waiting for the established event to * post its requests to the server. This event will wake up anyone * blocked in iw_cm_disconnect or iw_destroy_cm_id. 
*/ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { struct ib_qp *qp = NULL; unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); /* * Clear the connect wait bit so a callback function calling * iw_cm_disconnect will not wait and deadlock this thread */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); if (iw_event->status == 0) { cm_id_priv->id.m_local_addr = iw_event->local_addr; cm_id_priv->id.m_remote_addr = iw_event->remote_addr; iw_event->local_addr = cm_id_priv->id.local_addr; iw_event->remote_addr = cm_id_priv->id.remote_addr; cm_id_priv->state = IW_CM_STATE_ESTABLISHED; } else { /* REJECTED or RESET */ qp = cm_id_priv->qp; cm_id_priv->qp = NULL; cm_id_priv->state = IW_CM_STATE_IDLE; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) cm_id_priv->id.device->ops.iw_rem_ref(qp); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); if (iw_event->private_data_len) kfree(iw_event->private_data); /* Wake up waiters on connect complete */ wake_up_all(&cm_id_priv->connect_wait); return ret; } /* * CM_ID <-- CLOSING * * If in the ESTABLISHED state, move to CLOSING. */ static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED) cm_id_priv->state = IW_CM_STATE_CLOSING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); } /* * CM_ID <-- IDLE * * If in the ESTABLISHED or CLOSING states, the QP will have been * moved by the provider to the ERR state. Disassociate the CM_ID from * the QP, move to IDLE, and remove the 'connected' reference. * * If in some other state, the cm_id was destroyed asynchronously. * This is the last reference that will result in waking up * the app thread blocked in iw_destroy_cm_id. */ static int cm_close_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { struct ib_qp *qp; unsigned long flags; int ret = 0, notify_event = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); qp = cm_id_priv->qp; cm_id_priv->qp = NULL; switch (cm_id_priv->state) { case IW_CM_STATE_ESTABLISHED: case IW_CM_STATE_CLOSING: cm_id_priv->state = IW_CM_STATE_IDLE; notify_event = 1; break; case IW_CM_STATE_DESTROYING: break; default: BUG(); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) cm_id_priv->id.device->ops.iw_rem_ref(qp); if (notify_event) ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); return ret; } static int process_event(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { int ret = 0; switch (iw_event->event) { case IW_CM_EVENT_CONNECT_REQUEST: cm_conn_req_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_CONNECT_REPLY: ret = cm_conn_rep_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_ESTABLISHED: ret = cm_conn_est_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_DISCONNECT: cm_disconnect_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_CLOSE: ret = cm_close_handler(cm_id_priv, iw_event); break; default: BUG(); } return ret; } /* * Process events on the work_list for the cm_id. If the callback * function requests that the cm_id be deleted, a flag is set in the * cm_id flags to indicate that when the last reference is * removed, the cm_id is to be destroyed. This is necessary to * distinguish between an object that will be destroyed by the app * thread asleep on the destroy_comp list vs. 
an object destroyed * here synchronously when the last reference is removed. */ static void cm_work_handler(struct work_struct *_work) { struct iwcm_work *work = container_of(_work, struct iwcm_work, work); struct iw_cm_event levent; struct iwcm_id_private *cm_id_priv = work->cm_id; unsigned long flags; int ret = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); while (!list_empty(&cm_id_priv->work_list)) { work = list_first_entry(&cm_id_priv->work_list, struct iwcm_work, list); list_del_init(&work->list); levent = work->event; put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { ret = process_event(cm_id_priv, &levent); if (ret) WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id)); } else pr_debug("dropping event %d\n", levent.event); if (iwcm_deref_id(cm_id_priv)) return; spin_lock_irqsave(&cm_id_priv->lock, flags); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); } /* * This function is called on interrupt context. Schedule events on * the iwcm_wq thread to allow callback functions to downcall into * the CM and/or block. Events are queued to a per-CM_ID * work_list. If this is the first event on the work_list, the work * element is also queued on the iwcm_wq thread. * * Each event holds a reference on the cm_id. Until the last posted * event has been delivered and processed, the cm_id cannot be * deleted. * * Returns: * 0 - the event was handled. * -ENOMEM - the event was not handled due to lack of resources. */ static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct iwcm_work *work; struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret = 0; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); work = get_work(cm_id_priv); if (!work) { ret = -ENOMEM; goto out; } INIT_WORK(&work->work, cm_work_handler); work->cm_id = cm_id_priv; work->event = *iw_event; if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST || work->event.event == IW_CM_EVENT_CONNECT_REPLY) && work->event.private_data_len) { ret = copy_private_data(&work->event); if (ret) { put_work(work); goto out; } } refcount_inc(&cm_id_priv->refcount); list_add_tail(&work->list, &cm_id_priv->work_list); queue_work(iwcm_wq, &work->work); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE| IB_ACCESS_REMOTE_READ; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = 0; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct iwcm_id_private *cm_id_priv; int ret; cm_id_priv = 
container_of(cm_id, struct iwcm_id_private, id); switch (qp_attr->qp_state) { case IB_QPS_INIT: case IB_QPS_RTR: ret = iwcm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTS: ret = iwcm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: ret = -EINVAL; break; } return ret; } EXPORT_SYMBOL(iw_cm_init_qp_attr); static int __init iw_cm_init(void) { int ret; ret = iwpm_init(RDMA_NL_IWCM); if (ret) return ret; iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) goto err_alloc; iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", iwcm_ctl_table); if (!iwcm_ctl_table_hdr) { pr_err("iw_cm: couldn't register sysctl paths\n"); goto err_sysctl; } rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); return 0; err_sysctl: destroy_workqueue(iwcm_wq); err_alloc: iwpm_exit(RDMA_NL_IWCM); return -ENOMEM; } static void __exit iw_cm_cleanup(void) { rdma_nl_unregister(RDMA_NL_IWCM); unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); iwpm_exit(RDMA_NL_IWCM); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2); module_init(iw_cm_init); module_exit(iw_cm_cleanup);
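/*
 * Editor's illustration -- not part of iwcm.c above. A minimal sketch of a
 * passive-side consumer of the iw_cm API, under the following assumptions:
 * the example_* names are hypothetical, the ULP has already created a QP
 * whose number is passed to example_accept(), and cm_id->local_addr is
 * filled in before listening (omitted here). Only the iw_cm_* calls, the
 * IW_CM_EVENT_* codes and the handler signature come from the code above.
 */
#include <linux/err.h>
#include <rdma/iw_cm.h>

static struct iw_cm_id *example_pending;	/* hypothetical one-slot backlog */

static int example_cm_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event)
{
	switch (event->event) {
	case IW_CM_EVENT_CONNECT_REQUEST:
		/* child cm_id is in CONN_RECV; a non-zero return makes the
		 * core reject and destroy it (see cm_conn_req_handler) */
		example_pending = cm_id;
		return 0;
	case IW_CM_EVENT_ESTABLISHED:	/* MPA negotiation completed */
	case IW_CM_EVENT_DISCONNECT:
	case IW_CM_EVENT_CLOSE:
	default:
		return 0;
	}
}

/* Process context: CM_ID <-- ESTABLISHED once the provider reports it. */
static int example_accept(u32 qpn)
{
	struct iw_cm_conn_param param = { .qpn = qpn, .ord = 1, .ird = 1 };

	return iw_cm_accept(example_pending, &param);
}

/* Process context: CM_ID <-- LISTEN. */
static int example_listen(struct ib_device *dev)
{
	struct iw_cm_id *cm_id = iw_create_cm_id(dev, example_cm_handler, NULL);

	if (IS_ERR(cm_id))
		return PTR_ERR(cm_id);
	/* a real ULP fills cm_id->local_addr here before listening */
	return iw_cm_listen(cm_id, 16);
}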
// SPDX-License-Identifier: GPL-2.0-or-later /* * net/dccp/timer.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/skbuff.h> #include <linux/export.h> #include "dccp.h" /* sysctl variables governing numbers of retransmission attempts */ int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES; int sysctl_dccp_retries1 __read_mostly = TCP_RETR1; int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; static void dccp_write_err(struct sock *sk) { sk->sk_err = READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT; sk_error_report(sk); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_done(sk); __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT); } /* A write timeout has occurred. Process the after effects. */ static int dccp_write_timeout(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { if (icsk->icsk_retransmits != 0) dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : sysctl_dccp_request_retries; } else { if (icsk->icsk_retransmits >= sysctl_dccp_retries1) { /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires PMTU black hole detection. :-( This is the place to implement it. It is not implemented. I do not want to implement it. It is disgusting. It does not work in any case. Let me cite the same draft, which requires us to implement this: "The one security concern raised by this memo is that ICMP black holes are often caused by over-zealous security administrators who block all ICMP messages. It is vitally important that those who design and deploy security systems understand the impact of strict filtering on upper-layer protocols. The safest web site in the world is worthless if most TCP implementations cannot transfer data from it. It would be far nicer to have all of the black holes fixed rather than fixing all of the TCP implementations." Golden words :-). */ dst_negative_advice(sk); } retry_until = sysctl_dccp_retries2; /* * FIXME: see tcp_write_timeout and tcp_out_of_resources */ } if (icsk->icsk_retransmits >= retry_until) { /* Has it gone just too far? */ dccp_write_err(sk); return 1; } return 0; } /* * The DCCP retransmit timer. */ static void dccp_retransmit_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* * More than 4MSL (8 minutes) has passed, a RESET(aborted) was * sent, no need to retransmit, this sock is dead. 
*/ if (dccp_write_timeout(sk)) return; /* * We want to know the number of packets retransmitted, not the * total number of retransmissions of clones of original packets. */ if (icsk->icsk_retransmits == 0) __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS); if (dccp_retransmit_skb(sk) != 0) { /* * Retransmission failed because of local congestion, * do not backoff. */ if (--icsk->icsk_retransmits == 0) icsk->icsk_retransmits = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), DCCP_RTO_MAX); return; } icsk->icsk_backoff++; icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, DCCP_RTO_MAX); if (icsk->icsk_retransmits > sysctl_dccp_retries1) __sk_dst_reset(sk); } static void dccp_write_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; int event = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); goto out; } if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) goto out; if (time_after(icsk->icsk_timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); goto out; } event = icsk->icsk_pending; icsk->icsk_pending = 0; switch (event) { case ICSK_TIME_RETRANS: dccp_retransmit_timer(sk); break; } out: bh_unlock_sock(sk); sock_put(sk); } static void dccp_keepalive_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); pr_err("dccp should not use a keepalive timer !\n"); sock_put(sk); } /* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ static void dccp_delack_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_delack_timer); struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out; } if (sk->sk_state == DCCP_CLOSED || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; if (time_after(icsk->icsk_ack.timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); goto out; } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (inet_csk_ack_scheduled(sk)) { if (!inet_csk_in_pingpong_mode(sk)) { /* Delayed ACK missed: inflate ATO. */ icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } dccp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } out: bh_unlock_sock(sk); sock_put(sk); } /** * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface * @t: pointer to the tasklet associated with this handler * * See the comments above %ccid_dequeueing_decision for supported modes. 
*/ static void dccp_write_xmitlet(struct tasklet_struct *t) { struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet); struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); else dccp_write_xmit(sk); bh_unlock_sock(sk); sock_put(sk); } static void dccp_write_xmit_timer(struct timer_list *t) { struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); dccp_write_xmitlet(&dp->dccps_xmitlet); } void dccp_init_xmit_timers(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet); timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } static ktime_t dccp_timestamp_seed; /** * dccp_timestamp - 10s of microseconds time source * Returns the number of 10s of microseconds since loading DCCP. This is the * native DCCP time difference format (RFC 4340, sec. 13). * Please note: this wraps around roughly every 11.9 hours. */ u32 dccp_timestamp(void) { u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); do_div(delta, 10); return delta; } EXPORT_SYMBOL_GPL(dccp_timestamp); void __init dccp_timestamping_init(void) { dccp_timestamp_seed = ktime_get_real(); }
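/*
 * Editor's illustration -- not part of timer.c above. The kernel-doc says
 * dccp_timestamp() wraps roughly every 11.9 hours; this stand-alone userspace
 * snippet (plain C, nothing from the kernel) shows the arithmetic behind that
 * figure: a u32 counter of 10-microsecond units.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* 2^32 ten-microsecond ticks until the u32 counter wraps */
	double wrap_seconds = ((double)UINT32_MAX + 1.0) * 10e-6;

	printf("dccp_timestamp() wraps after %.0f s (~%.1f hours)\n",
	       wrap_seconds, wrap_seconds / 3600.0);	/* ~42950 s, ~11.9 h */
	return 0;
}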
// SPDX-License-Identifier: GPL-1.0+ /* generic HDLC line discipline for Linux * * Written by Paul Fulghum paulkf@microgate.com * for Microgate Corporation * * Microgate and SyncLink are registered trademarks of 
Microgate Corporation * * Adapted from ppp.c, written by Michael Callahan <callahan@maths.ox.ac.uk>, * Al Longyear <longyear@netcom.com>, * Paul Mackerras <Paul.Mackerras@cs.anu.edu.au> * * Original release 01/11/99 * * This module implements the tty line discipline N_HDLC for use with * tty device drivers that support bit-synchronous HDLC communications. * * All HDLC data is frame oriented which means: * * 1. tty write calls represent one complete transmit frame of data. * The device driver should accept the complete frame or none of * the frame (busy) in the write method. Each write call should have * a byte count in the range of 2-65535 bytes (2 is the minimum HDLC frame * with 1 addr byte and 1 ctrl byte). The max byte count of 65535 * should include any crc bytes required. For example, when using * CCITT CRC32, 4 crc bytes are required, so the maximum size frame * the application may transmit is limited to 65531 bytes. For CCITT * CRC16, the maximum application frame size would be 65533. * * * 2. receive callbacks from the device driver represent * one received frame. The device driver should bypass * the tty flip buffer and call the line discipline receive * callback directly to avoid fragmenting or concatenating * multiple frames into a single receive callback. * * The HDLC line discipline queues the receive frames in separate * buffers so complete receive frames can be returned by the * tty read calls. * * 3. tty read calls return an entire frame of data or nothing. * * 4. all send and receive data is considered raw. No processing * or translation is performed by the line discipline, regardless * of the tty flags. * * 5. When the line discipline is queried for the amount of receive * data available (FIONREAD), 0 is returned if no data is available; * otherwise the count of the next available frame is returned * (instead of the sum of all received frame counts). * * These conventions allow the standard tty programming interface * to be used for synchronous HDLC applications when used with * this line discipline (or another line discipline that is frame * oriented such as N_PPP). * * The SyncLink driver (synclink.c) implements both asynchronous * (using standard line discipline N_TTY) and synchronous HDLC * (using N_HDLC) communications, with the latter using the above * conventions. * * This implementation is very basic and does not maintain * any statistics. The main point is to enforce the raw data * and frame orientation of HDLC communications. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/interrupt.h> #include <linux/ptrace.h> #include <linux/poll.h> #include <linux/in.h> #include <linux/ioctl.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/errno.h> #include <linux/string.h> /* used in new tty drivers */ #include <linux/signal.h> /* used in new tty drivers */ #include <linux/if.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include "tty.h" /* * Buffers for individual HDLC frames */ #define MAX_HDLC_FRAME_SIZE 65535 #define DEFAULT_RX_BUF_COUNT 10 #define MAX_RX_BUF_COUNT 60 #define DEFAULT_TX_BUF_COUNT 3 struct n_hdlc_buf { struct list_head list_item; size_t count; u8 buf[]; }; struct n_hdlc_buf_list { struct list_head list; int count; spinlock_t spinlock; }; /** * struct n_hdlc - per device instance data structure * @tbusy: reentrancy flag for tx wakeup code * @woke_up: tx wakeup needs to be run again as it was called while @tbusy * @tx_buf_list: list of pending transmit frame buffers * @rx_buf_list: list of received frame buffers * @tx_free_buf_list: list unused transmit frame buffers * @rx_free_buf_list: list unused received frame buffers */ struct n_hdlc { bool tbusy; bool woke_up; struct n_hdlc_buf_list tx_buf_list; struct n_hdlc_buf_list rx_buf_list; struct n_hdlc_buf_list tx_free_buf_list; struct n_hdlc_buf_list rx_free_buf_list; struct work_struct write_work; struct tty_struct *tty_for_write_work; }; /* * HDLC buffer list manipulation functions */ static void n_hdlc_buf_return(struct n_hdlc_buf_list *buf_list, struct n_hdlc_buf *buf); static void n_hdlc_buf_put(struct n_hdlc_buf_list *list, struct n_hdlc_buf *buf); static struct n_hdlc_buf *n_hdlc_buf_get(struct n_hdlc_buf_list *list); /* Local functions */ static struct n_hdlc *n_hdlc_alloc(void); static void n_hdlc_tty_write_work(struct work_struct *work); /* max frame size for memory allocations */ static int maxframe = 4096; static void flush_rx_queue(struct tty_struct *tty) { struct n_hdlc *n_hdlc = tty->disc_data; struct n_hdlc_buf *buf; while ((buf = n_hdlc_buf_get(&n_hdlc->rx_buf_list))) n_hdlc_buf_put(&n_hdlc->rx_free_buf_list, buf); } static void flush_tx_queue(struct tty_struct *tty) { struct n_hdlc *n_hdlc = tty->disc_data; struct n_hdlc_buf *buf; while ((buf = n_hdlc_buf_get(&n_hdlc->tx_buf_list))) n_hdlc_buf_put(&n_hdlc->tx_free_buf_list, buf); } static void n_hdlc_free_buf_list(struct n_hdlc_buf_list *list) { struct n_hdlc_buf *buf; do { buf = n_hdlc_buf_get(list); kfree(buf); } while (buf); } /** * n_hdlc_tty_close - line discipline close * @tty: pointer to tty info structure * * Called when the line discipline is changed to something * else, the tty is closed, or the tty detects a hangup. 
*/ static void n_hdlc_tty_close(struct tty_struct *tty) { struct n_hdlc *n_hdlc = tty->disc_data; #if defined(TTY_NO_WRITE_SPLIT) clear_bit(TTY_NO_WRITE_SPLIT, &tty->flags); #endif tty->disc_data = NULL; /* Ensure that the n_hdlcd process is not hanging on select()/poll() */ wake_up_interruptible(&tty->read_wait); wake_up_interruptible(&tty->write_wait); cancel_work_sync(&n_hdlc->write_work); n_hdlc_free_buf_list(&n_hdlc->rx_free_buf_list); n_hdlc_free_buf_list(&n_hdlc->tx_free_buf_list); n_hdlc_free_buf_list(&n_hdlc->rx_buf_list); n_hdlc_free_buf_list(&n_hdlc->tx_buf_list); kfree(n_hdlc); } /* end of n_hdlc_tty_close() */ /** * n_hdlc_tty_open - called when line discipline changed to n_hdlc * @tty: pointer to tty info structure * * Returns 0 if success, otherwise error code */ static int n_hdlc_tty_open(struct tty_struct *tty) { struct n_hdlc *n_hdlc = tty->disc_data; pr_debug("%s() called (device=%s)\n", __func__, tty->name); /* There should not be an existing table for this slot. */ if (n_hdlc) { pr_err("%s: tty already associated!\n", __func__); return -EEXIST; } n_hdlc = n_hdlc_alloc(); if (!n_hdlc) { pr_err("%s: n_hdlc_alloc failed\n", __func__); return -ENFILE; } INIT_WORK(&n_hdlc->write_work, n_hdlc_tty_write_work); n_hdlc->tty_for_write_work = tty; tty->disc_data = n_hdlc; tty->receive_room = 65536; /* change tty_io write() to not split large writes into 8K chunks */ set_bit(TTY_NO_WRITE_SPLIT, &tty->flags); /* flush receive data from driver */ tty_driver_flush_buffer(tty); return 0; } /* end of n_tty_hdlc_open() */ /** * n_hdlc_send_frames - send frames on pending send buffer list * @n_hdlc: pointer to ldisc instance data * @tty: pointer to tty instance data * * Send frames on pending send buffer list until the driver does not accept a * frame (busy) this function is called after adding a frame to the send buffer * list and by the tty wakeup callback. 
*/ static void n_hdlc_send_frames(struct n_hdlc *n_hdlc, struct tty_struct *tty) { unsigned long flags; struct n_hdlc_buf *tbuf; ssize_t actual; check_again: spin_lock_irqsave(&n_hdlc->tx_buf_list.spinlock, flags); if (n_hdlc->tbusy) { n_hdlc->woke_up = true; spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags); return; } n_hdlc->tbusy = true; n_hdlc->woke_up = false; spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags); tbuf = n_hdlc_buf_get(&n_hdlc->tx_buf_list); while (tbuf) { pr_debug("sending frame %p, count=%zu\n", tbuf, tbuf->count); /* Send the next block of data to device */ set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); actual = tty->ops->write(tty, tbuf->buf, tbuf->count); /* rollback was possible and has been done */ if (actual == -ERESTARTSYS) { n_hdlc_buf_return(&n_hdlc->tx_buf_list, tbuf); break; } /* if transmit error, throw frame away by */ /* pretending it was accepted by driver */ if (actual < 0) actual = tbuf->count; if (actual == tbuf->count) { pr_debug("frame %p completed\n", tbuf); /* free current transmit buffer */ n_hdlc_buf_put(&n_hdlc->tx_free_buf_list, tbuf); /* wait up sleeping writers */ wake_up_interruptible(&tty->write_wait); /* get next pending transmit buffer */ tbuf = n_hdlc_buf_get(&n_hdlc->tx_buf_list); } else { pr_debug("frame %p pending\n", tbuf); /* * the buffer was not accepted by driver, * return it back into tx queue */ n_hdlc_buf_return(&n_hdlc->tx_buf_list, tbuf); break; } } if (!tbuf) clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); /* Clear the re-entry flag */ spin_lock_irqsave(&n_hdlc->tx_buf_list.spinlock, flags); n_hdlc->tbusy = false; spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags); if (n_hdlc->woke_up) goto check_again; } /* end of n_hdlc_send_frames() */ /** * n_hdlc_tty_write_work - Asynchronous callback for transmit wakeup * @work: pointer to work_struct * * Called when low level device driver can accept more send data. */ static void n_hdlc_tty_write_work(struct work_struct *work) { struct n_hdlc *n_hdlc = container_of(work, struct n_hdlc, write_work); struct tty_struct *tty = n_hdlc->tty_for_write_work; n_hdlc_send_frames(n_hdlc, tty); } /* end of n_hdlc_tty_write_work() */ /** * n_hdlc_tty_wakeup - Callback for transmit wakeup * @tty: pointer to associated tty instance data * * Called when low level device driver can accept more send data. */ static void n_hdlc_tty_wakeup(struct tty_struct *tty) { struct n_hdlc *n_hdlc = tty->disc_data; schedule_work(&n_hdlc->write_work); } /* end of n_hdlc_tty_wakeup() */ /** * n_hdlc_tty_receive - Called by tty driver when receive data is available * @tty: pointer to tty instance data * @data: pointer to received data * @flags: pointer to flags for data * @count: count of received data in bytes * * Called by tty low level driver when receive data is available. Data is * interpreted as one HDLC frame. 
*/ static void n_hdlc_tty_receive(struct tty_struct *tty, const u8 *data, const u8 *flags, size_t count) { register struct n_hdlc *n_hdlc = tty->disc_data; register struct n_hdlc_buf *buf; pr_debug("%s() called count=%zu\n", __func__, count); if (count > maxframe) { pr_debug("rx count>maxframesize, data discarded\n"); return; } /* get a free HDLC buffer */ buf = n_hdlc_buf_get(&n_hdlc->rx_free_buf_list); if (!buf) { /* * no buffers in free list, attempt to allocate another rx * buffer unless the maximum count has been reached */ if (n_hdlc->rx_buf_list.count < MAX_RX_BUF_COUNT) buf = kmalloc(struct_size(buf, buf, maxframe), GFP_ATOMIC); } if (!buf) { pr_debug("no more rx buffers, data discarded\n"); return; } /* copy received data to HDLC buffer */ memcpy(buf->buf, data, count); buf->count = count; /* add HDLC buffer to list of received frames */ n_hdlc_buf_put(&n_hdlc->rx_buf_list, buf); /* wake up any blocked reads and perform async signalling */ wake_up_interruptible(&tty->read_wait); if (tty->fasync != NULL) kill_fasync(&tty->fasync, SIGIO, POLL_IN); } /* end of n_hdlc_tty_receive() */ /** * n_hdlc_tty_read - Called to retrieve one frame of data (if available) * @tty: pointer to tty instance data * @file: pointer to open file object * @kbuf: pointer to returned data buffer * @nr: size of returned data buffer * @cookie: stored rbuf from previous run * @offset: offset into the data buffer * * Returns the number of bytes returned or error code. */ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file, u8 *kbuf, size_t nr, void **cookie, unsigned long offset) { struct n_hdlc *n_hdlc = tty->disc_data; int ret = 0; struct n_hdlc_buf *rbuf; DECLARE_WAITQUEUE(wait, current); /* Is this a repeated call for an rbuf we already found earlier? */ rbuf = *cookie; if (rbuf) goto have_rbuf; add_wait_queue(&tty->read_wait, &wait); for (;;) { if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) { ret = -EIO; break; } if (tty_hung_up_p(file)) break; set_current_state(TASK_INTERRUPTIBLE); rbuf = n_hdlc_buf_get(&n_hdlc->rx_buf_list); if (rbuf) break; /* no data */ if (tty_io_nonblock(tty, file)) { ret = -EAGAIN; break; } schedule(); if (signal_pending(current)) { ret = -EINTR; break; } } remove_wait_queue(&tty->read_wait, &wait); __set_current_state(TASK_RUNNING); if (!rbuf) return ret; *cookie = rbuf; have_rbuf: /* Have we used it up entirely? */ if (offset >= rbuf->count) goto done_with_rbuf; /* More data to go, but can't copy any more? EOVERFLOW */ ret = -EOVERFLOW; if (!nr) goto done_with_rbuf; /* Copy as much data as possible */ ret = rbuf->count - offset; if (ret > nr) ret = nr; memcpy(kbuf, rbuf->buf+offset, ret); offset += ret; /* If we still have data left, we leave the rbuf in the cookie */ if (offset < rbuf->count) return ret; done_with_rbuf: *cookie = NULL; if (n_hdlc->rx_free_buf_list.count > DEFAULT_RX_BUF_COUNT) kfree(rbuf); else n_hdlc_buf_put(&n_hdlc->rx_free_buf_list, rbuf); return ret; } /* end of n_hdlc_tty_read() */ /** * n_hdlc_tty_write - write a single frame of data to device * @tty: pointer to associated tty device instance data * @file: pointer to file object data * @data: pointer to transmit data (one frame) * @count: size of transmit frame in bytes * * Returns the number of bytes written (or error code). 
*/ static ssize_t n_hdlc_tty_write(struct tty_struct *tty, struct file *file, const u8 *data, size_t count) { struct n_hdlc *n_hdlc = tty->disc_data; DECLARE_WAITQUEUE(wait, current); struct n_hdlc_buf *tbuf; ssize_t error = 0; pr_debug("%s() called count=%zd\n", __func__, count); /* verify frame size */ if (count > maxframe) { pr_debug("%s: truncating user packet from %zu to %d\n", __func__, count, maxframe); count = maxframe; } add_wait_queue(&tty->write_wait, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); tbuf = n_hdlc_buf_get(&n_hdlc->tx_free_buf_list); if (tbuf) break; if (tty_io_nonblock(tty, file)) { error = -EAGAIN; break; } schedule(); if (signal_pending(current)) { error = -EINTR; break; } } __set_current_state(TASK_RUNNING); remove_wait_queue(&tty->write_wait, &wait); if (!error) { /* Retrieve the user's buffer */ memcpy(tbuf->buf, data, count); /* Send the data */ tbuf->count = error = count; n_hdlc_buf_put(&n_hdlc->tx_buf_list, tbuf); n_hdlc_send_frames(n_hdlc, tty); } return error; } /* end of n_hdlc_tty_write() */ /** * n_hdlc_tty_ioctl - process IOCTL system call for the tty device. * @tty: pointer to tty instance data * @cmd: IOCTL command code * @arg: argument for IOCTL call (cmd dependent) * * Returns command dependent result. */ static int n_hdlc_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct n_hdlc *n_hdlc = tty->disc_data; int error = 0; int count; unsigned long flags; struct n_hdlc_buf *buf = NULL; pr_debug("%s() called %d\n", __func__, cmd); switch (cmd) { case FIONREAD: /* report count of read data available */ /* in next available frame (if any) */ spin_lock_irqsave(&n_hdlc->rx_buf_list.spinlock, flags); buf = list_first_entry_or_null(&n_hdlc->rx_buf_list.list, struct n_hdlc_buf, list_item); if (buf) count = buf->count; else count = 0; spin_unlock_irqrestore(&n_hdlc->rx_buf_list.spinlock, flags); error = put_user(count, (int __user *)arg); break; case TIOCOUTQ: /* get the pending tx byte count in the driver */ count = tty_chars_in_buffer(tty); /* add size of next output frame in queue */ spin_lock_irqsave(&n_hdlc->tx_buf_list.spinlock, flags); buf = list_first_entry_or_null(&n_hdlc->tx_buf_list.list, struct n_hdlc_buf, list_item); if (buf) count += buf->count; spin_unlock_irqrestore(&n_hdlc->tx_buf_list.spinlock, flags); error = put_user(count, (int __user *)arg); break; case TCFLSH: switch (arg) { case TCIOFLUSH: case TCOFLUSH: flush_tx_queue(tty); } fallthrough; /* to default */ default: error = n_tty_ioctl_helper(tty, cmd, arg); break; } return error; } /* end of n_hdlc_tty_ioctl() */ /** * n_hdlc_tty_poll - TTY callback for poll system call * @tty: pointer to tty instance data * @filp: pointer to open file object for device * @wait: wait queue for operations * * Determine which operations (read/write) will not block and return info * to caller. * Returns a bit mask containing info on which ops will not block. 
*/ static __poll_t n_hdlc_tty_poll(struct tty_struct *tty, struct file *filp, poll_table *wait) { struct n_hdlc *n_hdlc = tty->disc_data; __poll_t mask = 0; /* * queue the current process into any wait queue that may awaken in the * future (read and write) */ poll_wait(filp, &tty->read_wait, wait); poll_wait(filp, &tty->write_wait, wait); /* set bits for operations that won't block */ if (!list_empty(&n_hdlc->rx_buf_list.list)) mask |= EPOLLIN | EPOLLRDNORM; /* readable */ if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) mask |= EPOLLHUP; if (tty_hung_up_p(filp)) mask |= EPOLLHUP; if (!tty_is_writelocked(tty) && !list_empty(&n_hdlc->tx_free_buf_list.list)) mask |= EPOLLOUT | EPOLLWRNORM; /* writable */ return mask; } /* end of n_hdlc_tty_poll() */ static void n_hdlc_alloc_buf(struct n_hdlc_buf_list *list, unsigned int count, const char *name) { struct n_hdlc_buf *buf; unsigned int i; for (i = 0; i < count; i++) { buf = kmalloc(struct_size(buf, buf, maxframe), GFP_KERNEL); if (!buf) { pr_debug("%s(), kmalloc() failed for %s buffer %u\n", __func__, name, i); return; } n_hdlc_buf_put(list, buf); } } /** * n_hdlc_alloc - allocate an n_hdlc instance data structure * * Returns a pointer to newly created structure if success, otherwise %NULL */ static struct n_hdlc *n_hdlc_alloc(void) { struct n_hdlc *n_hdlc = kzalloc(sizeof(*n_hdlc), GFP_KERNEL); if (!n_hdlc) return NULL; spin_lock_init(&n_hdlc->rx_free_buf_list.spinlock); spin_lock_init(&n_hdlc->tx_free_buf_list.spinlock); spin_lock_init(&n_hdlc->rx_buf_list.spinlock); spin_lock_init(&n_hdlc->tx_buf_list.spinlock); INIT_LIST_HEAD(&n_hdlc->rx_free_buf_list.list); INIT_LIST_HEAD(&n_hdlc->tx_free_buf_list.list); INIT_LIST_HEAD(&n_hdlc->rx_buf_list.list); INIT_LIST_HEAD(&n_hdlc->tx_buf_list.list); n_hdlc_alloc_buf(&n_hdlc->rx_free_buf_list, DEFAULT_RX_BUF_COUNT, "rx"); n_hdlc_alloc_buf(&n_hdlc->tx_free_buf_list, DEFAULT_TX_BUF_COUNT, "tx"); return n_hdlc; } /* end of n_hdlc_alloc() */ /** * n_hdlc_buf_return - put the HDLC buffer after the head of the specified list * @buf_list: pointer to the buffer list * @buf: pointer to the buffer */ static void n_hdlc_buf_return(struct n_hdlc_buf_list *buf_list, struct n_hdlc_buf *buf) { unsigned long flags; spin_lock_irqsave(&buf_list->spinlock, flags); list_add(&buf->list_item, &buf_list->list); buf_list->count++; spin_unlock_irqrestore(&buf_list->spinlock, flags); } /** * n_hdlc_buf_put - add specified HDLC buffer to tail of specified list * @buf_list: pointer to buffer list * @buf: pointer to buffer */ static void n_hdlc_buf_put(struct n_hdlc_buf_list *buf_list, struct n_hdlc_buf *buf) { unsigned long flags; spin_lock_irqsave(&buf_list->spinlock, flags); list_add_tail(&buf->list_item, &buf_list->list); buf_list->count++; spin_unlock_irqrestore(&buf_list->spinlock, flags); } /* end of n_hdlc_buf_put() */ /** * n_hdlc_buf_get - remove and return an HDLC buffer from list * @buf_list: pointer to HDLC buffer list * * Remove and return an HDLC buffer from the head of the specified HDLC buffer * list. * Returns a pointer to HDLC buffer if available, otherwise %NULL. 
*/ static struct n_hdlc_buf *n_hdlc_buf_get(struct n_hdlc_buf_list *buf_list) { unsigned long flags; struct n_hdlc_buf *buf; spin_lock_irqsave(&buf_list->spinlock, flags); buf = list_first_entry_or_null(&buf_list->list, struct n_hdlc_buf, list_item); if (buf) { list_del(&buf->list_item); buf_list->count--; } spin_unlock_irqrestore(&buf_list->spinlock, flags); return buf; } /* end of n_hdlc_buf_get() */ static struct tty_ldisc_ops n_hdlc_ldisc = { .owner = THIS_MODULE, .num = N_HDLC, .name = "hdlc", .open = n_hdlc_tty_open, .close = n_hdlc_tty_close, .read = n_hdlc_tty_read, .write = n_hdlc_tty_write, .ioctl = n_hdlc_tty_ioctl, .poll = n_hdlc_tty_poll, .receive_buf = n_hdlc_tty_receive, .write_wakeup = n_hdlc_tty_wakeup, .flush_buffer = flush_rx_queue, }; static int __init n_hdlc_init(void) { int status; /* range check maxframe arg */ maxframe = clamp(maxframe, 4096, MAX_HDLC_FRAME_SIZE); status = tty_register_ldisc(&n_hdlc_ldisc); if (!status) pr_info("N_HDLC line discipline registered with maxframe=%d\n", maxframe); else pr_err("N_HDLC: error registering line discipline: %d\n", status); return status; } /* end of init_module() */ static void __exit n_hdlc_exit(void) { tty_unregister_ldisc(&n_hdlc_ldisc); } module_init(n_hdlc_init); module_exit(n_hdlc_exit); MODULE_DESCRIPTION("HDLC line discipline support"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul Fulghum paulkf@microgate.com"); module_param(maxframe, int, 0); MODULE_ALIAS_LDISC(N_HDLC);
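/*
 * Editor's illustration -- not part of n_hdlc.c above. A userspace sketch of
 * the frame-oriented conventions documented at the top of this file: after
 * the tty is switched to the N_HDLC discipline, each write() submits exactly
 * one transmit frame and each read() returns at most one received frame. The
 * device path and buffer size are placeholders; error handling is minimal.
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/tty.h>		/* N_HDLC line discipline number */

int example_hdlc_echo(const char *devpath)
{
	int ldisc = N_HDLC;
	unsigned char frame[512];
	ssize_t n;
	int fd = open(devpath, O_RDWR | O_NOCTTY);

	if (fd < 0)
		return -1;
	if (ioctl(fd, TIOCSETD, &ldisc) < 0) {	/* attach N_HDLC to the tty */
		close(fd);
		return -1;
	}
	n = read(fd, frame, sizeof(frame));	/* one read == one whole frame */
	if (n > 0)
		(void)write(fd, frame, n);	/* one write == one frame (2..65535 bytes) */
	close(fd);
	return n < 0 ? -1 : 0;
}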
/* * Copyright (c) 2016 Laurent Pinchart <laurent.pinchart@ideasonboard.com> * * DRM core format related functions * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #include <linux/bug.h> #include <linux/ctype.h> #include <linux/export.h> #include <linux/kernel.h> #include <drm/drm_device.h> #include <drm/drm_fourcc.h> /** * drm_mode_legacy_fb_format - compute drm fourcc code from legacy description * @bpp: bits per pixel * @depth: bit depth per pixel * * Computes a drm fourcc pixel format code for the given @bpp/@depth values. 
*/ uint32_t drm_mode_legacy_fb_format(uint32_t bpp, uint32_t depth) { uint32_t fmt = DRM_FORMAT_INVALID; switch (bpp) { case 1: if (depth == 1) fmt = DRM_FORMAT_C1; break; case 2: if (depth == 2) fmt = DRM_FORMAT_C2; break; case 4: if (depth == 4) fmt = DRM_FORMAT_C4; break; case 8: if (depth == 8) fmt = DRM_FORMAT_C8; break; case 16: switch (depth) { case 15: fmt = DRM_FORMAT_XRGB1555; break; case 16: fmt = DRM_FORMAT_RGB565; break; default: break; } break; case 24: if (depth == 24) fmt = DRM_FORMAT_RGB888; break; case 32: switch (depth) { case 24: fmt = DRM_FORMAT_XRGB8888; break; case 30: fmt = DRM_FORMAT_XRGB2101010; break; case 32: fmt = DRM_FORMAT_ARGB8888; break; default: break; } break; default: break; } return fmt; } EXPORT_SYMBOL(drm_mode_legacy_fb_format); /** * drm_driver_legacy_fb_format - compute drm fourcc code from legacy description * @dev: DRM device * @bpp: bits per pixel * @depth: bit depth per pixel * * Computes a drm fourcc pixel format code for the given @bpp/@depth values. * Unlike drm_mode_legacy_fb_format() this looks at the driver's mode_config, * and depending on the &drm_mode_config.quirk_addfb_prefer_host_byte_order flag * it returns little endian byte order or host byte order framebuffer formats. */ uint32_t drm_driver_legacy_fb_format(struct drm_device *dev, uint32_t bpp, uint32_t depth) { uint32_t fmt = drm_mode_legacy_fb_format(bpp, depth); if (dev->mode_config.quirk_addfb_prefer_host_byte_order) { if (fmt == DRM_FORMAT_XRGB8888) fmt = DRM_FORMAT_HOST_XRGB8888; if (fmt == DRM_FORMAT_ARGB8888) fmt = DRM_FORMAT_HOST_ARGB8888; if (fmt == DRM_FORMAT_RGB565) fmt = DRM_FORMAT_HOST_RGB565; if (fmt == DRM_FORMAT_XRGB1555) fmt = DRM_FORMAT_HOST_XRGB1555; } if (dev->mode_config.quirk_addfb_prefer_xbgr_30bpp && fmt == DRM_FORMAT_XRGB2101010) fmt = DRM_FORMAT_XBGR2101010; return fmt; } EXPORT_SYMBOL(drm_driver_legacy_fb_format); /** * drm_driver_color_mode_format - Compute DRM 4CC code from color mode * @dev: DRM device * @color_mode: command-line color mode * * Computes a DRM 4CC pixel format code for the given color mode using * drm_driver_legacy_fb_format(). The color mode is in the format used on the * kernel command line. It specifies the number of bits per pixel * and color depth in a single value. * * Useful in fbdev emulation code, since that deals in those values. The * helper does not consider YUV or other complicated formats. This means * only legacy formats are supported (fmt->depth is a legacy field), but * the framebuffer emulation can only deal with such formats, specifically * RGB/BGR formats. */ uint32_t drm_driver_color_mode_format(struct drm_device *dev, unsigned int color_mode) { switch (color_mode) { case 15: return drm_driver_legacy_fb_format(dev, 16, 15); case 32: return drm_driver_legacy_fb_format(dev, 32, 24); default: return drm_driver_legacy_fb_format(dev, color_mode, color_mode); } } EXPORT_SYMBOL(drm_driver_color_mode_format); /* * Internal function to query information for a given format. See * drm_format_info() for the public API. 
*/ const struct drm_format_info *__drm_format_info(u32 format) { static const struct drm_format_info formats[] = { { .format = DRM_FORMAT_C1, .depth = 1, .num_planes = 1, .char_per_block = { 1, }, .block_w = { 8, }, .block_h = { 1, }, .hsub = 1, .vsub = 1, .is_color_indexed = true }, { .format = DRM_FORMAT_C2, .depth = 2, .num_planes = 1, .char_per_block = { 1, }, .block_w = { 4, }, .block_h = { 1, }, .hsub = 1, .vsub = 1, .is_color_indexed = true }, { .format = DRM_FORMAT_C4, .depth = 4, .num_planes = 1, .char_per_block = { 1, }, .block_w = { 2, }, .block_h = { 1, }, .hsub = 1, .vsub = 1, .is_color_indexed = true }, { .format = DRM_FORMAT_C8, .depth = 8, .num_planes = 1, .cpp = { 1, 0, 0 }, .hsub = 1, .vsub = 1, .is_color_indexed = true }, { .format = DRM_FORMAT_D1, .depth = 1, .num_planes = 1, .char_per_block = { 1, }, .block_w = { 8, }, .block_h = { 1, }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_D2, .depth = 2, .num_planes = 1, .char_per_block = { 1, }, .block_w = { 4, }, .block_h = { 1, }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_D4, .depth = 4, .num_planes = 1, .char_per_block = { 1, }, .block_w = { 2, }, .block_h = { 1, }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_D8, .depth = 8, .num_planes = 1, .cpp = { 1, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_R1, .depth = 1, .num_planes = 1, .char_per_block = { 1, }, .block_w = { 8, }, .block_h = { 1, }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_R2, .depth = 2, .num_planes = 1, .char_per_block = { 1, }, .block_w = { 4, }, .block_h = { 1, }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_R4, .depth = 4, .num_planes = 1, .char_per_block = { 1, }, .block_w = { 2, }, .block_h = { 1, }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_R8, .depth = 8, .num_planes = 1, .cpp = { 1, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_R10, .depth = 10, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_R12, .depth = 12, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_RGB332, .depth = 8, .num_planes = 1, .cpp = { 1, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_BGR233, .depth = 8, .num_planes = 1, .cpp = { 1, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_XRGB4444, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_XBGR4444, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_RGBX4444, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_BGRX4444, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_ARGB4444, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_ABGR4444, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_RGBA4444, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_BGRA4444, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_XRGB1555, .depth = 15, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_XBGR1555, .depth = 15, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_RGBX5551, .depth = 15, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_BGRX5551, .depth = 15, .num_planes = 1, .cpp = { 2, 0, 0 }, 
.hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_ARGB1555, .depth = 15, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_ABGR1555, .depth = 15, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_RGBA5551, .depth = 15, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_BGRA5551, .depth = 15, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_RGB565, .depth = 16, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_BGR565, .depth = 16, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, #ifdef __BIG_ENDIAN { .format = DRM_FORMAT_XRGB1555 | DRM_FORMAT_BIG_ENDIAN, .depth = 15, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_RGB565 | DRM_FORMAT_BIG_ENDIAN, .depth = 16, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 1, .vsub = 1 }, #endif { .format = DRM_FORMAT_RGB888, .depth = 24, .num_planes = 1, .cpp = { 3, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_BGR888, .depth = 24, .num_planes = 1, .cpp = { 3, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_XRGB8888, .depth = 24, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_XBGR8888, .depth = 24, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_RGBX8888, .depth = 24, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_BGRX8888, .depth = 24, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_RGB565_A8, .depth = 24, .num_planes = 2, .cpp = { 2, 1, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_BGR565_A8, .depth = 24, .num_planes = 2, .cpp = { 2, 1, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_XRGB2101010, .depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_XBGR2101010, .depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_RGBX1010102, .depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_BGRX1010102, .depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_ARGB2101010, .depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_ABGR2101010, .depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_RGBA1010102, .depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_BGRA1010102, .depth = 30, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_ARGB8888, .depth = 32, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_ABGR8888, .depth = 32, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_RGBA8888, .depth = 32, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_BGRA8888, .depth = 32, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_XRGB16161616F, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_XBGR16161616F, .depth = 0, .num_planes = 1, 
.cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_ARGB16161616F, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_ABGR16161616F, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_AXBXGXRX106106106106, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_XRGB16161616, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_XBGR16161616, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1 }, { .format = DRM_FORMAT_ARGB16161616, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_ABGR16161616, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_RGB888_A8, .depth = 32, .num_planes = 2, .cpp = { 3, 1, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_BGR888_A8, .depth = 32, .num_planes = 2, .cpp = { 3, 1, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_XRGB8888_A8, .depth = 32, .num_planes = 2, .cpp = { 4, 1, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_XBGR8888_A8, .depth = 32, .num_planes = 2, .cpp = { 4, 1, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_RGBX8888_A8, .depth = 32, .num_planes = 2, .cpp = { 4, 1, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_BGRX8888_A8, .depth = 32, .num_planes = 2, .cpp = { 4, 1, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true }, { .format = DRM_FORMAT_YUV410, .depth = 0, .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 4, .vsub = 4, .is_yuv = true }, { .format = DRM_FORMAT_YVU410, .depth = 0, .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 4, .vsub = 4, .is_yuv = true }, { .format = DRM_FORMAT_YUV411, .depth = 0, .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 4, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_YVU411, .depth = 0, .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 4, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_YUV420, .depth = 0, .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 2, .vsub = 2, .is_yuv = true }, { .format = DRM_FORMAT_YVU420, .depth = 0, .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 2, .vsub = 2, .is_yuv = true }, { .format = DRM_FORMAT_YUV422, .depth = 0, .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_YVU422, .depth = 0, .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_YUV444, .depth = 0, .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_YVU444, .depth = 0, .num_planes = 3, .cpp = { 1, 1, 1 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_NV12, .depth = 0, .num_planes = 2, .cpp = { 1, 2, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true }, { .format = DRM_FORMAT_NV21, .depth = 0, .num_planes = 2, .cpp = { 1, 2, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true }, { .format = DRM_FORMAT_NV16, .depth = 0, .num_planes = 2, .cpp = { 1, 2, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_NV61, .depth = 0, .num_planes = 2, .cpp = { 1, 2, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_NV24, .depth = 0, .num_planes = 2, .cpp = { 1, 2, 0 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_NV42, .depth = 0, .num_planes = 2, .cpp = { 1, 2, 0 
}, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_YUYV, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_YVYU, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_UYVY, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_VYUY, .depth = 0, .num_planes = 1, .cpp = { 2, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_XYUV8888, .depth = 0, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_VUY888, .depth = 0, .num_planes = 1, .cpp = { 3, 0, 0 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_AYUV, .depth = 0, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true, .is_yuv = true }, { .format = DRM_FORMAT_Y210, .depth = 0, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_Y212, .depth = 0, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_Y216, .depth = 0, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_Y410, .depth = 0, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true, .is_yuv = true }, { .format = DRM_FORMAT_Y412, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true, .is_yuv = true }, { .format = DRM_FORMAT_Y416, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1, .has_alpha = true, .is_yuv = true }, { .format = DRM_FORMAT_XVYU2101010, .depth = 0, .num_planes = 1, .cpp = { 4, 0, 0 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_XVYU12_16161616, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_XVYU16161616, .depth = 0, .num_planes = 1, .cpp = { 8, 0, 0 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_Y0L0, .depth = 0, .num_planes = 1, .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 }, .hsub = 2, .vsub = 2, .has_alpha = true, .is_yuv = true }, { .format = DRM_FORMAT_X0L0, .depth = 0, .num_planes = 1, .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true }, { .format = DRM_FORMAT_Y0L2, .depth = 0, .num_planes = 1, .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 }, .hsub = 2, .vsub = 2, .has_alpha = true, .is_yuv = true }, { .format = DRM_FORMAT_X0L2, .depth = 0, .num_planes = 1, .char_per_block = { 8, 0, 0 }, .block_w = { 2, 0, 0 }, .block_h = { 2, 0, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true }, { .format = DRM_FORMAT_P010, .depth = 0, .num_planes = 2, .char_per_block = { 2, 4, 0 }, .block_w = { 1, 1, 0 }, .block_h = { 1, 1, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true}, { .format = DRM_FORMAT_P012, .depth = 0, .num_planes = 2, .char_per_block = { 2, 4, 0 }, .block_w = { 1, 1, 0 }, .block_h = { 1, 1, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true}, { .format = DRM_FORMAT_P016, .depth = 0, .num_planes = 2, .char_per_block = { 2, 4, 0 }, .block_w = { 1, 1, 0 }, .block_h = { 1, 1, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true}, { .format = DRM_FORMAT_P210, .depth = 0, .num_planes = 2, .char_per_block = { 2, 4, 0 }, .block_w = { 1, 1, 0 }, .block_h = { 1, 1, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_VUY101010, .depth = 0, .num_planes = 1, 
.cpp = { 0, 0, 0 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_YUV420_8BIT, .depth = 0, .num_planes = 1, .cpp = { 0, 0, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true }, { .format = DRM_FORMAT_YUV420_10BIT, .depth = 0, .num_planes = 1, .cpp = { 0, 0, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true }, { .format = DRM_FORMAT_NV15, .depth = 0, .num_planes = 2, .char_per_block = { 5, 5, 0 }, .block_w = { 4, 2, 0 }, .block_h = { 1, 1, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true }, { .format = DRM_FORMAT_NV20, .depth = 0, .num_planes = 2, .char_per_block = { 5, 5, 0 }, .block_w = { 4, 2, 0 }, .block_h = { 1, 1, 0 }, .hsub = 2, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_NV30, .depth = 0, .num_planes = 2, .char_per_block = { 5, 5, 0 }, .block_w = { 4, 2, 0 }, .block_h = { 1, 1, 0 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_Q410, .depth = 0, .num_planes = 3, .char_per_block = { 2, 2, 2 }, .block_w = { 1, 1, 1 }, .block_h = { 1, 1, 1 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_Q401, .depth = 0, .num_planes = 3, .char_per_block = { 2, 2, 2 }, .block_w = { 1, 1, 1 }, .block_h = { 1, 1, 1 }, .hsub = 1, .vsub = 1, .is_yuv = true }, { .format = DRM_FORMAT_P030, .depth = 0, .num_planes = 2, .char_per_block = { 4, 8, 0 }, .block_w = { 3, 3, 0 }, .block_h = { 1, 1, 0 }, .hsub = 2, .vsub = 2, .is_yuv = true}, }; unsigned int i; for (i = 0; i < ARRAY_SIZE(formats); ++i) { if (formats[i].format == format) return &formats[i]; } return NULL; } /** * drm_format_info - query information for a given format * @format: pixel format (DRM_FORMAT_*) * * The caller should only pass a supported pixel format to this function. * Unsupported pixel formats will generate a warning in the kernel log. * * Returns: * The instance of struct drm_format_info that describes the pixel format, or * NULL if the format is unsupported. */ const struct drm_format_info *drm_format_info(u32 format) { const struct drm_format_info *info; info = __drm_format_info(format); WARN_ON(!info); return info; } EXPORT_SYMBOL(drm_format_info); /** * drm_get_format_info - query information for a given framebuffer configuration * @dev: DRM device * @mode_cmd: metadata from the userspace fb creation request * * Returns: * The instance of struct drm_format_info that describes the pixel format, or * NULL if the format is unsupported. */ const struct drm_format_info * drm_get_format_info(struct drm_device *dev, const struct drm_mode_fb_cmd2 *mode_cmd) { const struct drm_format_info *info = NULL; if (dev->mode_config.funcs->get_format_info) info = dev->mode_config.funcs->get_format_info(mode_cmd); if (!info) info = drm_format_info(mode_cmd->pixel_format); return info; } EXPORT_SYMBOL(drm_get_format_info); /** * drm_format_info_block_width - width in pixels of block. * @info: pixel format info * @plane: plane index * * Returns: * The width in pixels of a block, depending on the plane index. */ unsigned int drm_format_info_block_width(const struct drm_format_info *info, int plane) { if (!info || plane < 0 || plane >= info->num_planes) return 0; if (!info->block_w[plane]) return 1; return info->block_w[plane]; } EXPORT_SYMBOL(drm_format_info_block_width); /** * drm_format_info_block_height - height in pixels of a block * @info: pixel format info * @plane: plane index * * Returns: * The height in pixels of a block, depending on the plane index. 
*/ unsigned int drm_format_info_block_height(const struct drm_format_info *info, int plane) { if (!info || plane < 0 || plane >= info->num_planes) return 0; if (!info->block_h[plane]) return 1; return info->block_h[plane]; } EXPORT_SYMBOL(drm_format_info_block_height); /** * drm_format_info_bpp - number of bits per pixel * @info: pixel format info * @plane: plane index * * Returns: * The actual number of bits per pixel, depending on the plane index. */ unsigned int drm_format_info_bpp(const struct drm_format_info *info, int plane) { if (!info || plane < 0 || plane >= info->num_planes) return 0; return info->char_per_block[plane] * 8 / (drm_format_info_block_width(info, plane) * drm_format_info_block_height(info, plane)); } EXPORT_SYMBOL(drm_format_info_bpp); /** * drm_format_info_min_pitch - computes the minimum required pitch in bytes * @info: pixel format info * @plane: plane index * @buffer_width: buffer width in pixels * * Returns: * The minimum required pitch in bytes for a buffer by taking into consideration * the pixel format information and the buffer width. */ uint64_t drm_format_info_min_pitch(const struct drm_format_info *info, int plane, unsigned int buffer_width) { if (!info || plane < 0 || plane >= info->num_planes) return 0; return DIV_ROUND_UP_ULL((u64)buffer_width * info->char_per_block[plane], drm_format_info_block_width(info, plane) * drm_format_info_block_height(info, plane)); } EXPORT_SYMBOL(drm_format_info_min_pitch);
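/*
 * Example (added for illustration; not part of the original file): a minimal
 * sketch of how a caller might combine drm_format_info() and
 * drm_format_info_min_pitch() from above to validate user-supplied pitches.
 * The helper name check_fb_pitches() and the chroma-plane width handling are
 * assumptions; only the drm_format_info_*() calls are taken from this file.
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <drm/drm_fourcc.h>

static int check_fb_pitches(u32 pixel_format, unsigned int width,
			    const unsigned int *pitches)
{
	const struct drm_format_info *info = drm_format_info(pixel_format);
	int i;

	if (!info)
		return -EINVAL;

	for (i = 0; i < info->num_planes; i++) {
		/* Chroma planes of subsampled YUV formats cover fewer pixels. */
		unsigned int plane_width = i ? DIV_ROUND_UP(width, info->hsub)
					     : width;
		u64 min_pitch = drm_format_info_min_pitch(info, i, plane_width);

		if (pitches[i] < min_pitch)
			return -EINVAL;
	}

	return 0;
}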
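/*
 * Example (added for illustration; not part of the original file): the
 * bpp/depth pairs handled by drm_mode_legacy_fb_format() above map to fourcc
 * codes as checked below. check_legacy_mappings() is a hypothetical
 * self-check; the expected values follow directly from the switch statement
 * in that function.
 */
#include <linux/bug.h>
#include <drm/drm_fourcc.h>

static void __maybe_unused check_legacy_mappings(void)
{
	WARN_ON(drm_mode_legacy_fb_format(8, 8) != DRM_FORMAT_C8);
	WARN_ON(drm_mode_legacy_fb_format(16, 15) != DRM_FORMAT_XRGB1555);
	WARN_ON(drm_mode_legacy_fb_format(16, 16) != DRM_FORMAT_RGB565);
	WARN_ON(drm_mode_legacy_fb_format(24, 24) != DRM_FORMAT_RGB888);
	WARN_ON(drm_mode_legacy_fb_format(32, 24) != DRM_FORMAT_XRGB8888);
	WARN_ON(drm_mode_legacy_fb_format(32, 30) != DRM_FORMAT_XRGB2101010);
	WARN_ON(drm_mode_legacy_fb_format(32, 32) != DRM_FORMAT_ARGB8888);
}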
// SPDX-License-Identifier: GPL-2.0-only /* * CAN driver for "8 devices" USB2CAN converter * * Copyright (C) 2012 Bernd Krumboeck (krumboeck@universalnet.at) * * This driver is inspired by the 3.2.0 version of drivers/net/can/usb/ems_usb.c * and drivers/net/can/usb/esd_usb2.c * * Many thanks to Gerhard Bertelsmann (info@gerhard-bertelsmann.de) * for testing and fixing this driver. Also many thanks to "8 devices", * who were very cooperative and answered my questions. */ #include <linux/ethtool.h> #include <linux/signal.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/usb.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/error.h> /* driver constants */ #define MAX_RX_URBS 20 #define MAX_TX_URBS 20 #define RX_BUFFER_SIZE 64 /* vendor and product id */ #define USB_8DEV_VENDOR_ID 0x0483 #define USB_8DEV_PRODUCT_ID 0x1234 /* endpoints */ enum usb_8dev_endpoint { USB_8DEV_ENDP_DATA_RX = 1, USB_8DEV_ENDP_DATA_TX, USB_8DEV_ENDP_CMD_RX, USB_8DEV_ENDP_CMD_TX }; /* device CAN clock */ #define USB_8DEV_ABP_CLOCK 32000000 /* setup flags */ #define USB_8DEV_SILENT 0x01 #define USB_8DEV_LOOPBACK 0x02 #define USB_8DEV_DISABLE_AUTO_RESTRANS 0x04 #define USB_8DEV_STATUS_FRAME 0x08 /* commands */ enum usb_8dev_cmd { USB_8DEV_RESET = 1, USB_8DEV_OPEN, USB_8DEV_CLOSE, USB_8DEV_SET_SPEED, USB_8DEV_SET_MASK_FILTER, USB_8DEV_GET_STATUS, USB_8DEV_GET_STATISTICS, USB_8DEV_GET_SERIAL, USB_8DEV_GET_SOFTW_VER, USB_8DEV_GET_HARDW_VER, USB_8DEV_RESET_TIMESTAMP, USB_8DEV_GET_SOFTW_HARDW_VER }; /* command options */ #define USB_8DEV_BAUD_MANUAL 0x09 #define USB_8DEV_CMD_START 0x11 #define USB_8DEV_CMD_END 0x22 #define USB_8DEV_CMD_SUCCESS 0 #define USB_8DEV_CMD_ERROR 255 #define USB_8DEV_CMD_TIMEOUT 1000 /* frames */ #define USB_8DEV_DATA_START 0x55 #define USB_8DEV_DATA_END 0xAA #define USB_8DEV_TYPE_CAN_FRAME 0 #define USB_8DEV_TYPE_ERROR_FRAME 3 #define USB_8DEV_EXTID 0x01 #define USB_8DEV_RTR 0x02 #define USB_8DEV_ERR_FLAG 0x04 /* status */ #define USB_8DEV_STATUSMSG_OK 0x00 /* Normal condition. */ #define USB_8DEV_STATUSMSG_OVERRUN 0x01 /* Overrun occurred when sending */ #define USB_8DEV_STATUSMSG_BUSLIGHT 0x02 /* Error counter has reached 96 */ #define USB_8DEV_STATUSMSG_BUSHEAVY 0x03 /* Error counter
has reached 128 */ #define USB_8DEV_STATUSMSG_BUSOFF 0x04 /* Device is in BUSOFF */ #define USB_8DEV_STATUSMSG_STUFF 0x20 /* Stuff Error */ #define USB_8DEV_STATUSMSG_FORM 0x21 /* Form Error */ #define USB_8DEV_STATUSMSG_ACK 0x23 /* Ack Error */ #define USB_8DEV_STATUSMSG_BIT0 0x24 /* Bit1 Error */ #define USB_8DEV_STATUSMSG_BIT1 0x25 /* Bit0 Error */ #define USB_8DEV_STATUSMSG_CRC 0x27 /* CRC Error */ #define USB_8DEV_RP_MASK 0x7F /* Mask for Receive Error Bit */ /* table of devices that work with this driver */ static const struct usb_device_id usb_8dev_table[] = { { USB_DEVICE(USB_8DEV_VENDOR_ID, USB_8DEV_PRODUCT_ID) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, usb_8dev_table); struct usb_8dev_tx_urb_context { struct usb_8dev_priv *priv; u32 echo_index; }; /* Structure to hold all of our device specific stuff */ struct usb_8dev_priv { struct can_priv can; /* must be the first member */ struct usb_device *udev; struct net_device *netdev; atomic_t active_tx_urbs; struct usb_anchor tx_submitted; struct usb_8dev_tx_urb_context tx_contexts[MAX_TX_URBS]; struct usb_anchor rx_submitted; struct can_berr_counter bec; u8 *cmd_msg_buffer; struct mutex usb_8dev_cmd_lock; void *rxbuf[MAX_RX_URBS]; dma_addr_t rxbuf_dma[MAX_RX_URBS]; }; /* tx frame */ struct __packed usb_8dev_tx_msg { u8 begin; u8 flags; /* RTR and EXT_ID flag */ __be32 id; /* upper 3 bits not used */ u8 dlc; /* data length code 0-8 bytes */ u8 data[8]; /* 64-bit data */ u8 end; }; /* rx frame */ struct __packed usb_8dev_rx_msg { u8 begin; u8 type; /* frame type */ u8 flags; /* RTR and EXT_ID flag */ __be32 id; /* upper 3 bits not used */ u8 dlc; /* data length code 0-8 bytes */ u8 data[8]; /* 64-bit data */ __be32 timestamp; /* 32-bit timestamp */ u8 end; }; /* command frame */ struct __packed usb_8dev_cmd_msg { u8 begin; u8 channel; /* unknown - always 0 */ u8 command; /* command to execute */ u8 opt1; /* optional parameter / return value */ u8 opt2; /* optional parameter 2 */ u8 data[10]; /* optional parameter and data */ u8 end; }; static int usb_8dev_send_cmd_msg(struct usb_8dev_priv *priv, u8 *msg, int size) { int actual_length; return usb_bulk_msg(priv->udev, usb_sndbulkpipe(priv->udev, USB_8DEV_ENDP_CMD_TX), msg, size, &actual_length, USB_8DEV_CMD_TIMEOUT); } static int usb_8dev_wait_cmd_msg(struct usb_8dev_priv *priv, u8 *msg, int size, int *actual_length) { return usb_bulk_msg(priv->udev, usb_rcvbulkpipe(priv->udev, USB_8DEV_ENDP_CMD_RX), msg, size, actual_length, USB_8DEV_CMD_TIMEOUT); } /* Send command to device and receive result. * Command was successful when opt1 = 0. 
*/ static int usb_8dev_send_cmd(struct usb_8dev_priv *priv, struct usb_8dev_cmd_msg *out, struct usb_8dev_cmd_msg *in) { int err; int num_bytes_read; struct net_device *netdev; netdev = priv->netdev; out->begin = USB_8DEV_CMD_START; out->end = USB_8DEV_CMD_END; mutex_lock(&priv->usb_8dev_cmd_lock); memcpy(priv->cmd_msg_buffer, out, sizeof(struct usb_8dev_cmd_msg)); err = usb_8dev_send_cmd_msg(priv, priv->cmd_msg_buffer, sizeof(struct usb_8dev_cmd_msg)); if (err < 0) { netdev_err(netdev, "sending command message failed\n"); goto failed; } err = usb_8dev_wait_cmd_msg(priv, priv->cmd_msg_buffer, sizeof(struct usb_8dev_cmd_msg), &num_bytes_read); if (err < 0) { netdev_err(netdev, "no command message answer\n"); goto failed; } memcpy(in, priv->cmd_msg_buffer, sizeof(struct usb_8dev_cmd_msg)); if (in->begin != USB_8DEV_CMD_START || in->end != USB_8DEV_CMD_END || num_bytes_read != 16 || in->opt1 != 0) err = -EPROTO; failed: mutex_unlock(&priv->usb_8dev_cmd_lock); return err; } /* Send open command to device */ static int usb_8dev_cmd_open(struct usb_8dev_priv *priv) { struct can_bittiming *bt = &priv->can.bittiming; struct usb_8dev_cmd_msg outmsg; struct usb_8dev_cmd_msg inmsg; u32 ctrlmode = priv->can.ctrlmode; u32 flags = USB_8DEV_STATUS_FRAME; __be32 beflags; __be16 bebrp; memset(&outmsg, 0, sizeof(outmsg)); outmsg.command = USB_8DEV_OPEN; outmsg.opt1 = USB_8DEV_BAUD_MANUAL; outmsg.data[0] = bt->prop_seg + bt->phase_seg1; outmsg.data[1] = bt->phase_seg2; outmsg.data[2] = bt->sjw; /* BRP */ bebrp = cpu_to_be16((u16)bt->brp); memcpy(&outmsg.data[3], &bebrp, sizeof(bebrp)); /* flags */ if (ctrlmode & CAN_CTRLMODE_LOOPBACK) flags |= USB_8DEV_LOOPBACK; if (ctrlmode & CAN_CTRLMODE_LISTENONLY) flags |= USB_8DEV_SILENT; if (ctrlmode & CAN_CTRLMODE_ONE_SHOT) flags |= USB_8DEV_DISABLE_AUTO_RESTRANS; beflags = cpu_to_be32(flags); memcpy(&outmsg.data[5], &beflags, sizeof(beflags)); return usb_8dev_send_cmd(priv, &outmsg, &inmsg); } /* Send close command to device */ static int usb_8dev_cmd_close(struct usb_8dev_priv *priv) { struct usb_8dev_cmd_msg inmsg; struct usb_8dev_cmd_msg outmsg = { .channel = 0, .command = USB_8DEV_CLOSE, .opt1 = 0, .opt2 = 0 }; return usb_8dev_send_cmd(priv, &outmsg, &inmsg); } /* Get firmware and hardware version */ static int usb_8dev_cmd_version(struct usb_8dev_priv *priv, u32 *res) { struct usb_8dev_cmd_msg inmsg; struct usb_8dev_cmd_msg outmsg = { .channel = 0, .command = USB_8DEV_GET_SOFTW_HARDW_VER, .opt1 = 0, .opt2 = 0 }; int err = usb_8dev_send_cmd(priv, &outmsg, &inmsg); if (err) return err; *res = be32_to_cpup((__be32 *)inmsg.data); return err; } /* Set network device mode * * Maybe we should leave this function empty, because the device * set mode variable with open command. 
*/ static int usb_8dev_set_mode(struct net_device *netdev, enum can_mode mode) { struct usb_8dev_priv *priv = netdev_priv(netdev); int err = 0; switch (mode) { case CAN_MODE_START: err = usb_8dev_cmd_open(priv); if (err) netdev_warn(netdev, "couldn't start device"); break; default: return -EOPNOTSUPP; } return err; } /* Read error/status frames */ static void usb_8dev_rx_err_msg(struct usb_8dev_priv *priv, struct usb_8dev_rx_msg *msg) { struct can_frame *cf; struct sk_buff *skb; struct net_device_stats *stats = &priv->netdev->stats; /* Error message: * byte 0: Status * byte 1: bit 7: Receive Passive * byte 1: bit 0-6: Receive Error Counter * byte 2: Transmit Error Counter * byte 3: Always 0 (maybe reserved for future use) */ u8 state = msg->data[0]; u8 rxerr = msg->data[1] & USB_8DEV_RP_MASK; u8 txerr = msg->data[2]; int rx_errors = 0; int tx_errors = 0; skb = alloc_can_err_skb(priv->netdev, &cf); if (!skb) return; switch (state) { case USB_8DEV_STATUSMSG_OK: priv->can.state = CAN_STATE_ERROR_ACTIVE; cf->can_id |= CAN_ERR_PROT; cf->data[2] = CAN_ERR_PROT_ACTIVE; break; case USB_8DEV_STATUSMSG_BUSOFF: priv->can.state = CAN_STATE_BUS_OFF; cf->can_id |= CAN_ERR_BUSOFF; priv->can.can_stats.bus_off++; can_bus_off(priv->netdev); break; case USB_8DEV_STATUSMSG_OVERRUN: case USB_8DEV_STATUSMSG_BUSLIGHT: case USB_8DEV_STATUSMSG_BUSHEAVY: cf->can_id |= CAN_ERR_CRTL; break; default: priv->can.state = CAN_STATE_ERROR_WARNING; cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR; priv->can.can_stats.bus_error++; break; } switch (state) { case USB_8DEV_STATUSMSG_OK: case USB_8DEV_STATUSMSG_BUSOFF: break; case USB_8DEV_STATUSMSG_ACK: cf->can_id |= CAN_ERR_ACK; tx_errors = 1; break; case USB_8DEV_STATUSMSG_CRC: cf->data[3] = CAN_ERR_PROT_LOC_CRC_SEQ; rx_errors = 1; break; case USB_8DEV_STATUSMSG_BIT0: cf->data[2] |= CAN_ERR_PROT_BIT0; tx_errors = 1; break; case USB_8DEV_STATUSMSG_BIT1: cf->data[2] |= CAN_ERR_PROT_BIT1; tx_errors = 1; break; case USB_8DEV_STATUSMSG_FORM: cf->data[2] |= CAN_ERR_PROT_FORM; rx_errors = 1; break; case USB_8DEV_STATUSMSG_STUFF: cf->data[2] |= CAN_ERR_PROT_STUFF; rx_errors = 1; break; case USB_8DEV_STATUSMSG_OVERRUN: cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; stats->rx_over_errors++; rx_errors = 1; break; case USB_8DEV_STATUSMSG_BUSLIGHT: priv->can.state = CAN_STATE_ERROR_WARNING; cf->data[1] = (txerr > rxerr) ? CAN_ERR_CRTL_TX_WARNING : CAN_ERR_CRTL_RX_WARNING; priv->can.can_stats.error_warning++; break; case USB_8DEV_STATUSMSG_BUSHEAVY: priv->can.state = CAN_STATE_ERROR_PASSIVE; cf->data[1] = (txerr > rxerr) ? 
CAN_ERR_CRTL_TX_PASSIVE : CAN_ERR_CRTL_RX_PASSIVE; priv->can.can_stats.error_passive++; break; default: netdev_warn(priv->netdev, "Unknown status/error message (%d)\n", state); break; } if (tx_errors) { cf->data[2] |= CAN_ERR_PROT_TX; stats->tx_errors++; } if (rx_errors) stats->rx_errors++; if (priv->can.state != CAN_STATE_BUS_OFF) { cf->can_id |= CAN_ERR_CNT; cf->data[6] = txerr; cf->data[7] = rxerr; } priv->bec.txerr = txerr; priv->bec.rxerr = rxerr; netif_rx(skb); } /* Read data and status frames */ static void usb_8dev_rx_can_msg(struct usb_8dev_priv *priv, struct usb_8dev_rx_msg *msg) { struct can_frame *cf; struct sk_buff *skb; struct net_device_stats *stats = &priv->netdev->stats; if (msg->type == USB_8DEV_TYPE_ERROR_FRAME && msg->flags == USB_8DEV_ERR_FLAG) { usb_8dev_rx_err_msg(priv, msg); } else if (msg->type == USB_8DEV_TYPE_CAN_FRAME) { skb = alloc_can_skb(priv->netdev, &cf); if (!skb) return; cf->can_id = be32_to_cpu(msg->id); can_frame_set_cc_len(cf, msg->dlc & 0xF, priv->can.ctrlmode); if (msg->flags & USB_8DEV_EXTID) cf->can_id |= CAN_EFF_FLAG; if (msg->flags & USB_8DEV_RTR) { cf->can_id |= CAN_RTR_FLAG; } else { memcpy(cf->data, msg->data, cf->len); stats->rx_bytes += cf->len; } stats->rx_packets++; netif_rx(skb); } else { netdev_warn(priv->netdev, "frame type %d unknown", msg->type); } } /* Callback for reading data from device * * Check urb status, call read function and resubmit urb read operation. */ static void usb_8dev_read_bulk_callback(struct urb *urb) { struct usb_8dev_priv *priv = urb->context; struct net_device *netdev; int retval; int pos = 0; netdev = priv->netdev; if (!netif_device_present(netdev)) return; switch (urb->status) { case 0: /* success */ break; case -ENOENT: case -EPIPE: case -EPROTO: case -ESHUTDOWN: return; default: netdev_info(netdev, "Rx URB aborted (%d)\n", urb->status); goto resubmit_urb; } while (pos < urb->actual_length) { struct usb_8dev_rx_msg *msg; if (pos + sizeof(struct usb_8dev_rx_msg) > urb->actual_length) { netdev_err(priv->netdev, "format error\n"); break; } msg = (struct usb_8dev_rx_msg *)(urb->transfer_buffer + pos); usb_8dev_rx_can_msg(priv, msg); pos += sizeof(struct usb_8dev_rx_msg); } resubmit_urb: usb_fill_bulk_urb(urb, priv->udev, usb_rcvbulkpipe(priv->udev, USB_8DEV_ENDP_DATA_RX), urb->transfer_buffer, RX_BUFFER_SIZE, usb_8dev_read_bulk_callback, priv); retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval == -ENODEV) netif_device_detach(netdev); else if (retval) netdev_err(netdev, "failed resubmitting read bulk urb: %d\n", retval); } /* Callback handler for write operations * * Free allocated buffers, check transmit status and * calculate statistic. 
*/ static void usb_8dev_write_bulk_callback(struct urb *urb) { struct usb_8dev_tx_urb_context *context = urb->context; struct usb_8dev_priv *priv; struct net_device *netdev; BUG_ON(!context); priv = context->priv; netdev = priv->netdev; /* free up our allocated buffer */ usb_free_coherent(urb->dev, urb->transfer_buffer_length, urb->transfer_buffer, urb->transfer_dma); atomic_dec(&priv->active_tx_urbs); if (!netif_device_present(netdev)) return; if (urb->status) netdev_info(netdev, "Tx URB aborted (%d)\n", urb->status); netdev->stats.tx_packets++; netdev->stats.tx_bytes += can_get_echo_skb(netdev, context->echo_index, NULL); /* Release context */ context->echo_index = MAX_TX_URBS; netif_wake_queue(netdev); } /* Send data to device */ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct usb_8dev_priv *priv = netdev_priv(netdev); struct net_device_stats *stats = &netdev->stats; struct can_frame *cf = (struct can_frame *) skb->data; struct usb_8dev_tx_msg *msg; struct urb *urb; struct usb_8dev_tx_urb_context *context = NULL; u8 *buf; int i, err; size_t size = sizeof(struct usb_8dev_tx_msg); if (can_dev_dropped_skb(netdev, skb)) return NETDEV_TX_OK; /* create a URB, and a buffer for it, and copy the data to the URB */ urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) goto nomem; buf = usb_alloc_coherent(priv->udev, size, GFP_ATOMIC, &urb->transfer_dma); if (!buf) { netdev_err(netdev, "No memory left for USB buffer\n"); goto nomembuf; } memset(buf, 0, size); msg = (struct usb_8dev_tx_msg *)buf; msg->begin = USB_8DEV_DATA_START; msg->flags = 0x00; if (cf->can_id & CAN_RTR_FLAG) msg->flags |= USB_8DEV_RTR; if (cf->can_id & CAN_EFF_FLAG) msg->flags |= USB_8DEV_EXTID; msg->id = cpu_to_be32(cf->can_id & CAN_ERR_MASK); msg->dlc = can_get_cc_dlc(cf, priv->can.ctrlmode); memcpy(msg->data, cf->data, cf->len); msg->end = USB_8DEV_DATA_END; for (i = 0; i < MAX_TX_URBS; i++) { if (priv->tx_contexts[i].echo_index == MAX_TX_URBS) { context = &priv->tx_contexts[i]; break; } } /* May never happen! When this happens we'd more URBs in flight as * allowed (MAX_TX_URBS). */ if (!context) goto nofreecontext; context->priv = priv; context->echo_index = i; usb_fill_bulk_urb(urb, priv->udev, usb_sndbulkpipe(priv->udev, USB_8DEV_ENDP_DATA_TX), buf, size, usb_8dev_write_bulk_callback, context); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &priv->tx_submitted); can_put_echo_skb(skb, netdev, context->echo_index, 0); atomic_inc(&priv->active_tx_urbs); err = usb_submit_urb(urb, GFP_ATOMIC); if (unlikely(err)) { can_free_echo_skb(netdev, context->echo_index, NULL); usb_unanchor_urb(urb); usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); atomic_dec(&priv->active_tx_urbs); if (err == -ENODEV) netif_device_detach(netdev); else netdev_warn(netdev, "failed tx_urb %d\n", err); stats->tx_dropped++; } else if (atomic_read(&priv->active_tx_urbs) >= MAX_TX_URBS) /* Slow down tx path */ netif_stop_queue(netdev); /* Release our reference to this URB, the USB core will eventually free * it entirely. 
*/ usb_free_urb(urb); return NETDEV_TX_OK; nofreecontext: usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); usb_free_urb(urb); netdev_warn(netdev, "couldn't find free context"); return NETDEV_TX_BUSY; nomembuf: usb_free_urb(urb); nomem: dev_kfree_skb(skb); stats->tx_dropped++; return NETDEV_TX_OK; } static int usb_8dev_get_berr_counter(const struct net_device *netdev, struct can_berr_counter *bec) { struct usb_8dev_priv *priv = netdev_priv(netdev); bec->txerr = priv->bec.txerr; bec->rxerr = priv->bec.rxerr; return 0; } /* Start USB device */ static int usb_8dev_start(struct usb_8dev_priv *priv) { struct net_device *netdev = priv->netdev; int err, i; for (i = 0; i < MAX_RX_URBS; i++) { struct urb *urb = NULL; u8 *buf; dma_addr_t buf_dma; /* create a URB, and a buffer for it */ urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) { err = -ENOMEM; break; } buf = usb_alloc_coherent(priv->udev, RX_BUFFER_SIZE, GFP_KERNEL, &buf_dma); if (!buf) { netdev_err(netdev, "No memory left for USB buffer\n"); usb_free_urb(urb); err = -ENOMEM; break; } urb->transfer_dma = buf_dma; usb_fill_bulk_urb(urb, priv->udev, usb_rcvbulkpipe(priv->udev, USB_8DEV_ENDP_DATA_RX), buf, RX_BUFFER_SIZE, usb_8dev_read_bulk_callback, priv); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; usb_anchor_urb(urb, &priv->rx_submitted); err = usb_submit_urb(urb, GFP_KERNEL); if (err) { usb_unanchor_urb(urb); usb_free_coherent(priv->udev, RX_BUFFER_SIZE, buf, urb->transfer_dma); usb_free_urb(urb); break; } priv->rxbuf[i] = buf; priv->rxbuf_dma[i] = buf_dma; /* Drop reference, USB core will take care of freeing it */ usb_free_urb(urb); } /* Did we submit any URBs */ if (i == 0) { netdev_warn(netdev, "couldn't setup read URBs\n"); return err; } /* Warn if we've couldn't transmit all the URBs */ if (i < MAX_RX_URBS) netdev_warn(netdev, "rx performance may be slow\n"); err = usb_8dev_cmd_open(priv); if (err) goto failed; priv->can.state = CAN_STATE_ERROR_ACTIVE; return 0; failed: if (err == -ENODEV) netif_device_detach(priv->netdev); netdev_warn(netdev, "couldn't submit control: %d\n", err); return err; } /* Open USB device */ static int usb_8dev_open(struct net_device *netdev) { struct usb_8dev_priv *priv = netdev_priv(netdev); int err; /* common open */ err = open_candev(netdev); if (err) return err; /* finally start device */ err = usb_8dev_start(priv); if (err) { if (err == -ENODEV) netif_device_detach(priv->netdev); netdev_warn(netdev, "couldn't start device: %d\n", err); close_candev(netdev); return err; } netif_start_queue(netdev); return 0; } static void unlink_all_urbs(struct usb_8dev_priv *priv) { int i; usb_kill_anchored_urbs(&priv->rx_submitted); for (i = 0; i < MAX_RX_URBS; ++i) usb_free_coherent(priv->udev, RX_BUFFER_SIZE, priv->rxbuf[i], priv->rxbuf_dma[i]); usb_kill_anchored_urbs(&priv->tx_submitted); atomic_set(&priv->active_tx_urbs, 0); for (i = 0; i < MAX_TX_URBS; i++) priv->tx_contexts[i].echo_index = MAX_TX_URBS; } /* Close USB device */ static int usb_8dev_close(struct net_device *netdev) { struct usb_8dev_priv *priv = netdev_priv(netdev); int err = 0; /* Send CLOSE command to CAN controller */ err = usb_8dev_cmd_close(priv); if (err) netdev_warn(netdev, "couldn't stop device"); priv->can.state = CAN_STATE_STOPPED; netif_stop_queue(netdev); /* Stop polling */ unlink_all_urbs(priv); close_candev(netdev); return err; } static const struct net_device_ops usb_8dev_netdev_ops = { .ndo_open = usb_8dev_open, .ndo_stop = usb_8dev_close, .ndo_start_xmit = usb_8dev_start_xmit, .ndo_change_mtu = can_change_mtu, }; static 
const struct ethtool_ops usb_8dev_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static const struct can_bittiming_const usb_8dev_bittiming_const = { .name = KBUILD_MODNAME, .tseg1_min = 1, .tseg1_max = 16, .tseg2_min = 1, .tseg2_max = 8, .sjw_max = 4, .brp_min = 1, .brp_max = 1024, .brp_inc = 1, }; /* Probe USB device * * Check device and firmware. * Set supported modes and bittiming constants. * Allocate some memory. */ static int usb_8dev_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct net_device *netdev; struct usb_8dev_priv *priv; int i, err = -ENOMEM; u32 version; char buf[18]; struct usb_device *usbdev = interface_to_usbdev(intf); /* product id looks strange, better we also check iProduct string */ if (usb_string(usbdev, usbdev->descriptor.iProduct, buf, sizeof(buf)) > 0 && strcmp(buf, "USB2CAN converter")) { dev_info(&usbdev->dev, "ignoring: not an USB2CAN converter\n"); return -ENODEV; } netdev = alloc_candev(sizeof(struct usb_8dev_priv), MAX_TX_URBS); if (!netdev) { dev_err(&intf->dev, "Couldn't alloc candev\n"); return -ENOMEM; } priv = netdev_priv(netdev); priv->udev = usbdev; priv->netdev = netdev; priv->can.state = CAN_STATE_STOPPED; priv->can.clock.freq = USB_8DEV_ABP_CLOCK; priv->can.bittiming_const = &usb_8dev_bittiming_const; priv->can.do_set_mode = usb_8dev_set_mode; priv->can.do_get_berr_counter = usb_8dev_get_berr_counter; priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK | CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_ONE_SHOT | CAN_CTRLMODE_CC_LEN8_DLC; netdev->netdev_ops = &usb_8dev_netdev_ops; netdev->ethtool_ops = &usb_8dev_ethtool_ops; netdev->flags |= IFF_ECHO; /* we support local echo */ init_usb_anchor(&priv->rx_submitted); init_usb_anchor(&priv->tx_submitted); atomic_set(&priv->active_tx_urbs, 0); for (i = 0; i < MAX_TX_URBS; i++) priv->tx_contexts[i].echo_index = MAX_TX_URBS; priv->cmd_msg_buffer = devm_kzalloc(&intf->dev, sizeof(struct usb_8dev_cmd_msg), GFP_KERNEL); if (!priv->cmd_msg_buffer) goto cleanup_candev; usb_set_intfdata(intf, priv); SET_NETDEV_DEV(netdev, &intf->dev); mutex_init(&priv->usb_8dev_cmd_lock); err = register_candev(netdev); if (err) { netdev_err(netdev, "couldn't register CAN device: %d\n", err); goto cleanup_candev; } err = usb_8dev_cmd_version(priv, &version); if (err) { netdev_err(netdev, "can't get firmware version\n"); goto cleanup_unregister_candev; } else { netdev_info(netdev, "firmware: %d.%d, hardware: %d.%d\n", (version>>24) & 0xff, (version>>16) & 0xff, (version>>8) & 0xff, version & 0xff); } return 0; cleanup_unregister_candev: unregister_netdev(priv->netdev); cleanup_candev: free_candev(netdev); return err; } /* Called by the usb core when driver is unloaded or device is removed */ static void usb_8dev_disconnect(struct usb_interface *intf) { struct usb_8dev_priv *priv = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); if (priv) { netdev_info(priv->netdev, "device disconnected\n"); unregister_netdev(priv->netdev); unlink_all_urbs(priv); free_candev(priv->netdev); } } static struct usb_driver usb_8dev_driver = { .name = KBUILD_MODNAME, .probe = usb_8dev_probe, .disconnect = usb_8dev_disconnect, .id_table = usb_8dev_table, }; module_usb_driver(usb_8dev_driver); MODULE_AUTHOR("Bernd Krumboeck <krumboeck@universalnet.at>"); MODULE_DESCRIPTION("CAN driver for 8 devices USB2CAN interfaces"); MODULE_LICENSE("GPL v2");
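/*
 * Example (added for illustration; not part of the original driver): a sketch
 * of how a further command could be issued with the framing used above. The
 * helper usb_8dev_cmd_serial() is hypothetical and assumes it sits in this
 * file, since it reuses the static usb_8dev_send_cmd() and the
 * USB_8DEV_GET_SERIAL command code. The reply layout (serial number in
 * data[0..3], big endian) is an assumption modeled on usb_8dev_cmd_version(),
 * not taken from device documentation.
 */
static int usb_8dev_cmd_serial(struct usb_8dev_priv *priv, u32 *serial)
{
	struct usb_8dev_cmd_msg inmsg;
	struct usb_8dev_cmd_msg outmsg = {
		.channel = 0,
		.command = USB_8DEV_GET_SERIAL,
		.opt1 = 0,
		.opt2 = 0
	};
	int err = usb_8dev_send_cmd(priv, &outmsg, &inmsg);

	if (err)
		return err;

	*serial = be32_to_cpup((__be32 *)inmsg.data);
	return 0;
}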
// SPDX-License-Identifier: GPL-2.0-only /* * Frontend driver for the GENPIX 8psk/qpsk/DCII USB2.0 DVB-S module * * Copyright (C) 2006,2007 Alan Nisota (alannisota@gmail.com) * Copyright (C) 2006,2007 Genpix Electronics (genpix@genpix-electronics.com) * * Thanks to GENPIX for the sample code used to implement this module. * * This module is based on the vp7045 and vp702x modules */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "gp8psk-fe.h" #include <media/dvb_frontend.h> static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "Turn on/off debugging (default:off)."); #define dprintk(fmt, arg...)
do { \ if (debug) \ printk(KERN_DEBUG pr_fmt("%s: " fmt), \ __func__, ##arg); \ } while (0) struct gp8psk_fe_state { struct dvb_frontend fe; void *priv; const struct gp8psk_fe_ops *ops; bool is_rev1; u8 lock; u16 snr; unsigned long next_status_check; unsigned long status_check_interval; }; static int gp8psk_tuned_to_DCII(struct dvb_frontend *fe) { struct gp8psk_fe_state *st = fe->demodulator_priv; u8 status; st->ops->in(st->priv, GET_8PSK_CONFIG, 0, 0, &status, 1); return status & bmDCtuned; } static int gp8psk_set_tuner_mode(struct dvb_frontend *fe, int mode) { struct gp8psk_fe_state *st = fe->demodulator_priv; return st->ops->out(st->priv, SET_8PSK_CONFIG, mode, 0, NULL, 0); } static int gp8psk_fe_update_status(struct gp8psk_fe_state *st) { u8 buf[6]; if (time_after(jiffies,st->next_status_check)) { st->ops->in(st->priv, GET_SIGNAL_LOCK, 0, 0, &st->lock, 1); st->ops->in(st->priv, GET_SIGNAL_STRENGTH, 0, 0, buf, 6); st->snr = (buf[1]) << 8 | buf[0]; st->next_status_check = jiffies + (st->status_check_interval*HZ)/1000; } return 0; } static int gp8psk_fe_read_status(struct dvb_frontend *fe, enum fe_status *status) { struct gp8psk_fe_state *st = fe->demodulator_priv; gp8psk_fe_update_status(st); if (st->lock) *status = FE_HAS_LOCK | FE_HAS_SYNC | FE_HAS_VITERBI | FE_HAS_SIGNAL | FE_HAS_CARRIER; else *status = 0; if (*status & FE_HAS_LOCK) st->status_check_interval = 1000; else st->status_check_interval = 100; return 0; } /* not supported by this Frontend */ static int gp8psk_fe_read_ber(struct dvb_frontend* fe, u32 *ber) { (void) fe; *ber = 0; return 0; } /* not supported by this Frontend */ static int gp8psk_fe_read_unc_blocks(struct dvb_frontend* fe, u32 *unc) { (void) fe; *unc = 0; return 0; } static int gp8psk_fe_read_snr(struct dvb_frontend* fe, u16 *snr) { struct gp8psk_fe_state *st = fe->demodulator_priv; gp8psk_fe_update_status(st); /* snr is reported in dBu*256 */ *snr = st->snr; return 0; } static int gp8psk_fe_read_signal_strength(struct dvb_frontend* fe, u16 *strength) { struct gp8psk_fe_state *st = fe->demodulator_priv; gp8psk_fe_update_status(st); /* snr is reported in dBu*256 */ /* snr / 38.4 ~= 100% strength */ /* snr * 17 returns 100% strength as 65535 */ if (st->snr > 0xf00) *strength = 0xffff; else *strength = (st->snr << 4) + st->snr; /* snr*17 */ return 0; } static int gp8psk_fe_get_tune_settings(struct dvb_frontend* fe, struct dvb_frontend_tune_settings *tune) { tune->min_delay_ms = 800; return 0; } static int gp8psk_fe_set_frontend(struct dvb_frontend *fe) { struct gp8psk_fe_state *st = fe->demodulator_priv; struct dtv_frontend_properties *c = &fe->dtv_property_cache; u8 cmd[10]; u32 freq = c->frequency * 1000; dprintk("%s()\n", __func__); cmd[4] = freq & 0xff; cmd[5] = (freq >> 8) & 0xff; cmd[6] = (freq >> 16) & 0xff; cmd[7] = (freq >> 24) & 0xff; /* backwards compatibility: DVB-S + 8-PSK were used for Turbo-FEC */ if (c->delivery_system == SYS_DVBS && c->modulation == PSK_8) c->delivery_system = SYS_TURBO; switch (c->delivery_system) { case SYS_DVBS: if (c->modulation != QPSK) { dprintk("%s: unsupported modulation selected (%d)\n", __func__, c->modulation); return -EOPNOTSUPP; } c->fec_inner = FEC_AUTO; break; case SYS_DVBS2: /* kept for backwards compatibility */ dprintk("%s: DVB-S2 delivery system selected\n", __func__); break; case SYS_TURBO: dprintk("%s: Turbo-FEC delivery system selected\n", __func__); break; default: dprintk("%s: unsupported delivery system selected (%d)\n", __func__, c->delivery_system); return -EOPNOTSUPP; } cmd[0] = c->symbol_rate & 0xff; 
cmd[1] = (c->symbol_rate >> 8) & 0xff; cmd[2] = (c->symbol_rate >> 16) & 0xff; cmd[3] = (c->symbol_rate >> 24) & 0xff; switch (c->modulation) { case QPSK: if (st->is_rev1) if (gp8psk_tuned_to_DCII(fe)) st->ops->reload(st->priv); switch (c->fec_inner) { case FEC_1_2: cmd[9] = 0; break; case FEC_2_3: cmd[9] = 1; break; case FEC_3_4: cmd[9] = 2; break; case FEC_5_6: cmd[9] = 3; break; case FEC_7_8: cmd[9] = 4; break; case FEC_AUTO: cmd[9] = 5; break; default: cmd[9] = 5; break; } if (c->delivery_system == SYS_TURBO) cmd[8] = ADV_MOD_TURBO_QPSK; else cmd[8] = ADV_MOD_DVB_QPSK; break; case PSK_8: /* PSK_8 is for compatibility with DN */ cmd[8] = ADV_MOD_TURBO_8PSK; switch (c->fec_inner) { case FEC_2_3: cmd[9] = 0; break; case FEC_3_4: cmd[9] = 1; break; case FEC_3_5: cmd[9] = 2; break; case FEC_5_6: cmd[9] = 3; break; case FEC_8_9: cmd[9] = 4; break; default: cmd[9] = 0; break; } break; case QAM_16: /* QAM_16 is for compatibility with DN */ cmd[8] = ADV_MOD_TURBO_16QAM; cmd[9] = 0; break; default: /* Unknown modulation */ dprintk("%s: unsupported modulation selected (%d)\n", __func__, c->modulation); return -EOPNOTSUPP; } if (st->is_rev1) gp8psk_set_tuner_mode(fe, 0); st->ops->out(st->priv, TUNE_8PSK, 0, 0, cmd, 10); st->lock = 0; st->next_status_check = jiffies; st->status_check_interval = 200; return 0; } static int gp8psk_fe_send_diseqc_msg (struct dvb_frontend* fe, struct dvb_diseqc_master_cmd *m) { struct gp8psk_fe_state *st = fe->demodulator_priv; dprintk("%s\n", __func__); if (st->ops->out(st->priv, SEND_DISEQC_COMMAND, m->msg[0], 0, m->msg, m->msg_len)) { return -EINVAL; } return 0; } static int gp8psk_fe_send_diseqc_burst(struct dvb_frontend *fe, enum fe_sec_mini_cmd burst) { struct gp8psk_fe_state *st = fe->demodulator_priv; u8 cmd; dprintk("%s\n", __func__); /* These commands are certainly wrong */ cmd = (burst == SEC_MINI_A) ? 0x00 : 0x01; if (st->ops->out(st->priv, SEND_DISEQC_COMMAND, cmd, 0, &cmd, 0)) { return -EINVAL; } return 0; } static int gp8psk_fe_set_tone(struct dvb_frontend *fe, enum fe_sec_tone_mode tone) { struct gp8psk_fe_state *st = fe->demodulator_priv; if (st->ops->out(st->priv, SET_22KHZ_TONE, (tone == SEC_TONE_ON), 0, NULL, 0)) { return -EINVAL; } return 0; } static int gp8psk_fe_set_voltage(struct dvb_frontend *fe, enum fe_sec_voltage voltage) { struct gp8psk_fe_state *st = fe->demodulator_priv; if (st->ops->out(st->priv, SET_LNB_VOLTAGE, voltage == SEC_VOLTAGE_18, 0, NULL, 0)) { return -EINVAL; } return 0; } static int gp8psk_fe_enable_high_lnb_voltage(struct dvb_frontend* fe, long onoff) { struct gp8psk_fe_state *st = fe->demodulator_priv; return st->ops->out(st->priv, USE_EXTRA_VOLT, onoff, 0, NULL, 0); } static int gp8psk_fe_send_legacy_dish_cmd (struct dvb_frontend* fe, unsigned long sw_cmd) { struct gp8psk_fe_state *st = fe->demodulator_priv; u8 cmd = sw_cmd & 0x7f; if (st->ops->out(st->priv, SET_DN_SWITCH, cmd, 0, NULL, 0)) return -EINVAL; if (st->ops->out(st->priv, SET_LNB_VOLTAGE, !!(sw_cmd & 0x80), 0, NULL, 0)) return -EINVAL; return 0; } static void gp8psk_fe_release(struct dvb_frontend* fe) { struct gp8psk_fe_state *st = fe->demodulator_priv; kfree(st); } static const struct dvb_frontend_ops gp8psk_fe_ops; struct dvb_frontend *gp8psk_fe_attach(const struct gp8psk_fe_ops *ops, void *priv, bool is_rev1) { struct gp8psk_fe_state *st; if (!ops || !ops->in || !ops->out || !ops->reload) { pr_err("Error! 
gp8psk-fe ops not defined.\n"); return NULL; } st = kzalloc(sizeof(struct gp8psk_fe_state), GFP_KERNEL); if (!st) return NULL; memcpy(&st->fe.ops, &gp8psk_fe_ops, sizeof(struct dvb_frontend_ops)); st->fe.demodulator_priv = st; st->ops = ops; st->priv = priv; st->is_rev1 = is_rev1; pr_info("Frontend %sattached\n", is_rev1 ? "revision 1 " : ""); return &st->fe; } EXPORT_SYMBOL_GPL(gp8psk_fe_attach); static const struct dvb_frontend_ops gp8psk_fe_ops = { .delsys = { SYS_DVBS }, .info = { .name = "Genpix DVB-S", .frequency_min_hz = 800 * MHz, .frequency_max_hz = 2250 * MHz, .frequency_stepsize_hz = 100 * kHz, .symbol_rate_min = 1000000, .symbol_rate_max = 45000000, .symbol_rate_tolerance = 500, /* ppm */ .caps = FE_CAN_INVERSION_AUTO | FE_CAN_FEC_1_2 | FE_CAN_FEC_2_3 | FE_CAN_FEC_3_4 | FE_CAN_FEC_5_6 | FE_CAN_FEC_7_8 | FE_CAN_FEC_AUTO | /* * FE_CAN_QAM_16 is for compatibility * (Myth incorrectly detects Turbo-QPSK as plain QAM-16) */ FE_CAN_QPSK | FE_CAN_QAM_16 | FE_CAN_TURBO_FEC }, .release = gp8psk_fe_release, .init = NULL, .sleep = NULL, .set_frontend = gp8psk_fe_set_frontend, .get_tune_settings = gp8psk_fe_get_tune_settings, .read_status = gp8psk_fe_read_status, .read_ber = gp8psk_fe_read_ber, .read_signal_strength = gp8psk_fe_read_signal_strength, .read_snr = gp8psk_fe_read_snr, .read_ucblocks = gp8psk_fe_read_unc_blocks, .diseqc_send_master_cmd = gp8psk_fe_send_diseqc_msg, .diseqc_send_burst = gp8psk_fe_send_diseqc_burst, .set_tone = gp8psk_fe_set_tone, .set_voltage = gp8psk_fe_set_voltage, .dishnetwork_send_legacy_command = gp8psk_fe_send_legacy_dish_cmd, .enable_high_lnb_voltage = gp8psk_fe_enable_high_lnb_voltage }; MODULE_AUTHOR("Alan Nisota <alannisota@gamil.com>"); MODULE_DESCRIPTION("Frontend Driver for Genpix DVB-S"); MODULE_VERSION("1.1"); MODULE_LICENSE("GPL");
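/*
 * Example (added for illustration; not part of the original driver): a sketch
 * of the 10-byte TUNE_8PSK command layout built by gp8psk_fe_set_frontend()
 * above. gp8psk_pack_tune() is a hypothetical helper; the byte positions
 * (symbol rate in cmd[0..3] and frequency in Hz in cmd[4..7], both
 * least-significant byte first, advanced modulation in cmd[8], FEC index in
 * cmd[9]) are taken from the packing code in this file.
 */
#include <linux/types.h>

static void gp8psk_pack_tune(u8 cmd[10], u32 symbol_rate, u32 freq_hz,
			     u8 adv_mod, u8 fec)
{
	cmd[0] = symbol_rate & 0xff;		/* symbol rate, LSB first */
	cmd[1] = (symbol_rate >> 8) & 0xff;
	cmd[2] = (symbol_rate >> 16) & 0xff;
	cmd[3] = (symbol_rate >> 24) & 0xff;
	cmd[4] = freq_hz & 0xff;		/* frequency in Hz, LSB first */
	cmd[5] = (freq_hz >> 8) & 0xff;
	cmd[6] = (freq_hz >> 16) & 0xff;
	cmd[7] = (freq_hz >> 24) & 0xff;
	cmd[8] = adv_mod;			/* e.g. ADV_MOD_DVB_QPSK */
	cmd[9] = fec;				/* FEC index; 5 means auto for QPSK */
}

/*
 * Usage sketch: gp8psk_pack_tune(cmd, 27500000, 1200000 * 1000,
 * ADV_MOD_DVB_QPSK, 5) builds the same command bytes that
 * gp8psk_fe_set_frontend() would send for a 27.5 MS/s QPSK transponder at
 * 1200000 kHz with automatic FEC.
 */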
// SPDX-License-Identifier: GPL-2.0-only /* fs/fat/nfs.c */ #include <linux/exportfs.h> #include "fat.h" struct fat_fid { u32 i_gen; u32 i_pos_low; u16 i_pos_hi; u16 parent_i_pos_hi; u32 parent_i_pos_low; u32 parent_i_gen; }; #define FAT_FID_SIZE_WITHOUT_PARENT 3 #define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) /* * Look up a directory inode given its starting cluster. */ static struct inode *fat_dget(struct super_block *sb, int i_logstart) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct hlist_head *head; struct msdos_inode_info *i; struct inode *inode = NULL; head = sbi->dir_hashtable + fat_dir_hash(i_logstart); spin_lock(&sbi->dir_hash_lock); hlist_for_each_entry(i, head, i_dir_hash) { BUG_ON(i->vfs_inode.i_sb != sb); if (i->i_logstart != i_logstart) continue; inode = igrab(&i->vfs_inode); if (inode) break; } spin_unlock(&sbi->dir_hash_lock); return inode; } static struct inode *fat_ilookup(struct super_block *sb, u64 ino, loff_t i_pos) { if (MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) return fat_iget(sb, i_pos); else { if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO)) return NULL; return ilookup(sb, ino); } } static struct inode *__fat_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation, loff_t i_pos) { struct inode *inode = fat_ilookup(sb, ino, i_pos); if (inode && generation && (inode->i_generation != generation)) { iput(inode); inode = NULL; } if (inode == NULL && MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) { struct buffer_head *bh = NULL; struct msdos_dir_entry *de; sector_t blocknr; int offset; fat_get_blknr_offset(MSDOS_SB(sb), i_pos, &blocknr, &offset); bh = sb_bread(sb, blocknr); if (!bh) { fat_msg(sb, KERN_ERR, "unable to read block(%llu) for building NFS inode", (llu)blocknr); return inode; } de = (struct msdos_dir_entry *)bh->b_data; /* If a file is deleted on the server and the client is not updated * yet, we must not build the inode upon a lookup call.
*/ if (IS_FREE(de[offset].name)) inode = NULL; else inode = fat_build_inode(sb, &de[offset], i_pos); brelse(bh); } return inode; } static struct inode *fat_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { return __fat_nfs_get_inode(sb, ino, generation, 0); } static int fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent) { int len = *lenp; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); struct fat_fid *fid = (struct fat_fid *) fh; loff_t i_pos; int type = FILEID_FAT_WITHOUT_PARENT; if (parent) { if (len < FAT_FID_SIZE_WITH_PARENT) { *lenp = FAT_FID_SIZE_WITH_PARENT; return FILEID_INVALID; } } else { if (len < FAT_FID_SIZE_WITHOUT_PARENT) { *lenp = FAT_FID_SIZE_WITHOUT_PARENT; return FILEID_INVALID; } } i_pos = fat_i_pos_read(sbi, inode); *lenp = FAT_FID_SIZE_WITHOUT_PARENT; fid->i_gen = inode->i_generation; fid->i_pos_low = i_pos & 0xFFFFFFFF; fid->i_pos_hi = (i_pos >> 32) & 0xFFFF; if (parent) { i_pos = fat_i_pos_read(sbi, parent); fid->parent_i_pos_hi = (i_pos >> 32) & 0xFFFF; fid->parent_i_pos_low = i_pos & 0xFFFFFFFF; fid->parent_i_gen = parent->i_generation; type = FILEID_FAT_WITH_PARENT; *lenp = FAT_FID_SIZE_WITH_PARENT; } else { /* * We need to initialize this field because the fh is actually * 12 bytes long */ fid->parent_i_pos_hi = 0; } return type; } /* * Map a NFS file handle to a corresponding dentry. * The dentry may or may not be connected to the filesystem root. */ static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, fat_nfs_get_inode); } static struct dentry *fat_fh_to_dentry_nostale(struct super_block *sb, struct fid *fh, int fh_len, int fh_type) { struct inode *inode = NULL; struct fat_fid *fid = (struct fat_fid *)fh; loff_t i_pos; switch (fh_type) { case FILEID_FAT_WITHOUT_PARENT: if (fh_len < FAT_FID_SIZE_WITHOUT_PARENT) return NULL; break; case FILEID_FAT_WITH_PARENT: if (fh_len < FAT_FID_SIZE_WITH_PARENT) return NULL; break; default: return NULL; } i_pos = fid->i_pos_hi; i_pos = (i_pos << 32) | (fid->i_pos_low); inode = __fat_nfs_get_inode(sb, 0, fid->i_gen, i_pos); return d_obtain_alias(inode); } /* * Find the parent for a file specified by NFS handle. * This requires that the handle contain the i_ino of the parent. 
*/ static struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, fat_nfs_get_inode); } static struct dentry *fat_fh_to_parent_nostale(struct super_block *sb, struct fid *fh, int fh_len, int fh_type) { struct inode *inode = NULL; struct fat_fid *fid = (struct fat_fid *)fh; loff_t i_pos; if (fh_len < FAT_FID_SIZE_WITH_PARENT) return NULL; switch (fh_type) { case FILEID_FAT_WITH_PARENT: i_pos = fid->parent_i_pos_hi; i_pos = (i_pos << 32) | (fid->parent_i_pos_low); inode = __fat_nfs_get_inode(sb, 0, fid->parent_i_gen, i_pos); break; } return d_obtain_alias(inode); } /* * Rebuild the parent for a directory that is not connected * to the filesystem root */ static struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart) { int search_clus, clus_to_match; struct msdos_dir_entry *de; struct inode *parent = NULL; struct inode *dummy_grand_parent = NULL; struct fat_slot_info sinfo; struct msdos_sb_info *sbi = MSDOS_SB(sb); sector_t blknr = fat_clus_to_blknr(sbi, parent_logstart); struct buffer_head *parent_bh = sb_bread(sb, blknr); if (!parent_bh) { fat_msg(sb, KERN_ERR, "unable to read cluster of parent directory"); return NULL; } de = (struct msdos_dir_entry *) parent_bh->b_data; clus_to_match = fat_get_start(sbi, &de[0]); search_clus = fat_get_start(sbi, &de[1]); dummy_grand_parent = fat_dget(sb, search_clus); if (!dummy_grand_parent) { dummy_grand_parent = new_inode(sb); if (!dummy_grand_parent) { brelse(parent_bh); return parent; } dummy_grand_parent->i_ino = iunique(sb, MSDOS_ROOT_INO); fat_fill_inode(dummy_grand_parent, &de[1]); MSDOS_I(dummy_grand_parent)->i_pos = -1; } if (!fat_scan_logstart(dummy_grand_parent, clus_to_match, &sinfo)) parent = fat_build_inode(sb, sinfo.de, sinfo.i_pos); brelse(parent_bh); iput(dummy_grand_parent); return parent; } /* * Find the parent for a directory that is not currently connected to * the filesystem root. * * On entry, the caller holds d_inode(child_dir)->i_mutex. */ static struct dentry *fat_get_parent(struct dentry *child_dir) { struct super_block *sb = child_dir->d_sb; struct buffer_head *bh = NULL; struct msdos_dir_entry *de; struct inode *parent_inode = NULL; struct msdos_sb_info *sbi = MSDOS_SB(sb); if (!fat_get_dotdot_entry(d_inode(child_dir), &bh, &de)) { int parent_logstart = fat_get_start(sbi, de); parent_inode = fat_dget(sb, parent_logstart); if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO) parent_inode = fat_rebuild_parent(sb, parent_logstart); } brelse(bh); return d_obtain_alias(parent_inode); } const struct export_operations fat_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = fat_fh_to_dentry, .fh_to_parent = fat_fh_to_parent, .get_parent = fat_get_parent, }; const struct export_operations fat_export_ops_nostale = { .encode_fh = fat_encode_fh_nostale, .fh_to_dentry = fat_fh_to_dentry_nostale, .fh_to_parent = fat_fh_to_parent_nostale, .get_parent = fat_get_parent, };
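/*
 * A small standalone sketch of the nostale handle encoding above:
 * fat_encode_fh_nostale() splits the directory-entry position into a 32-bit
 * low word and a 16-bit high word, and fat_fh_to_dentry_nostale() joins them
 * again. Plain integers stand in for struct fat_fid; the sample value is
 * arbitrary.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t i_pos = 0x123456789ABULL;		/* sample i_pos */

	/* encode, as in fat_encode_fh_nostale() */
	uint32_t i_pos_low = i_pos & 0xFFFFFFFF;
	uint16_t i_pos_hi  = (i_pos >> 32) & 0xFFFF;

	/* decode, as in fat_fh_to_dentry_nostale() */
	uint64_t decoded = ((uint64_t)i_pos_hi << 32) | i_pos_low;

	printf("low=0x%08x hi=0x%04x round-trip ok=%d\n",
	       (unsigned)i_pos_low, (unsigned)i_pos_hi, decoded == i_pos);
	return 0;
}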
1 11 10 2 9 2 1 1 3 1 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2002 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2002-2005 by David Brownell */ // #define DEBUG // error path messages, extra info // #define VERBOSE // more; success messages #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/crc32.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/usbnet.h> /* * All known Zaurii lie about their standards conformance. At least * the earliest SA-1100 models lie by saying they support CDC Ethernet. * Some later models (especially PXA-25x and PXA-27x based ones) lie * and say they support CDC MDLM (for access to cell phone modems). * * There are non-Zaurus products that use these same protocols too. * * The annoying thing is that at the same time Sharp was developing * that annoying standards-breaking software, the Linux community had * a simple "CDC Subset" working reliably on the same SA-1100 hardware. * That is, the same functionality but not violating standards. * * The CDC Ethernet nonconformance points are troublesome to hosts * with a true CDC Ethernet implementation: * - Framing appends a CRC, which the spec says drivers "must not" do; * - Transfers data in altsetting zero, instead of altsetting 1; * - All these peripherals use the same ethernet address. * * The CDC MDLM nonconformance is less immediately troublesome, since all * MDLM implementations are quasi-proprietary anyway. 
*/ static struct sk_buff * zaurus_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int padlen; struct sk_buff *skb2; padlen = 2; if (!skb_cloned(skb)) { int tailroom = skb_tailroom(skb); if ((padlen + 4) <= tailroom) goto done; } skb2 = skb_copy_expand(skb, 0, 4 + padlen, flags); dev_kfree_skb_any(skb); skb = skb2; if (skb) { u32 fcs; done: fcs = crc32_le(~0, skb->data, skb->len); fcs = ~fcs; skb_put_u8(skb, fcs & 0xff); skb_put_u8(skb, (fcs >> 8) & 0xff); skb_put_u8(skb, (fcs >> 16) & 0xff); skb_put_u8(skb, (fcs >> 24) & 0xff); } return skb; } static int zaurus_bind(struct usbnet *dev, struct usb_interface *intf) { /* Belcarra's funky framing has other options; mostly * TRAILERS (!) with 4 bytes CRC, and maybe 2 pad bytes. */ dev->net->hard_header_len += 6; dev->rx_urb_size = dev->net->hard_header_len + dev->net->mtu; return usbnet_generic_cdc_bind(dev, intf); } /* PDA style devices are always connected if present */ static int always_connected (struct usbnet *dev) { return 0; } static const struct driver_info zaurus_sl5x00_info = { .description = "Sharp Zaurus SL-5x00", .flags = FLAG_POINTTOPOINT | FLAG_FRAMING_Z, .check_connect = always_connected, .bind = zaurus_bind, .unbind = usbnet_cdc_unbind, .tx_fixup = zaurus_tx_fixup, }; #define ZAURUS_STRONGARM_INFO ((unsigned long)&zaurus_sl5x00_info) static const struct driver_info zaurus_pxa_info = { .description = "Sharp Zaurus, PXA-2xx based", .flags = FLAG_POINTTOPOINT | FLAG_FRAMING_Z, .check_connect = always_connected, .bind = zaurus_bind, .unbind = usbnet_cdc_unbind, .tx_fixup = zaurus_tx_fixup, }; #define ZAURUS_PXA_INFO ((unsigned long)&zaurus_pxa_info) static const struct driver_info olympus_mxl_info = { .description = "Olympus R1000", .flags = FLAG_POINTTOPOINT | FLAG_FRAMING_Z, .check_connect = always_connected, .bind = zaurus_bind, .unbind = usbnet_cdc_unbind, .tx_fixup = zaurus_tx_fixup, }; #define OLYMPUS_MXL_INFO ((unsigned long)&olympus_mxl_info) /* Some more recent products using Lineo/Belcarra code will wrongly claim * CDC MDLM conformance. They aren't conformant: data endpoints live * in the control interface, there's no data interface, and it's not used * to talk to a cell phone radio. But at least we can detect these two * pseudo-classes, rather than growing this product list with entries for * each new nonconformant product (sigh). */ static const u8 safe_guid[16] = { 0x5d, 0x34, 0xcf, 0x66, 0x11, 0x18, 0x11, 0xd6, 0xa2, 0x1a, 0x00, 0x01, 0x02, 0xca, 0x9a, 0x7f, }; static const u8 blan_guid[16] = { 0x74, 0xf0, 0x3d, 0xbd, 0x1e, 0xc1, 0x44, 0x70, 0xa3, 0x67, 0x71, 0x34, 0xc9, 0xf5, 0x54, 0x37, }; static int blan_mdlm_bind(struct usbnet *dev, struct usb_interface *intf) { u8 *buf = intf->cur_altsetting->extra; int len = intf->cur_altsetting->extralen; struct usb_cdc_mdlm_desc *desc = NULL; struct usb_cdc_mdlm_detail_desc *detail = NULL; while (len > 3) { if (buf [1] != USB_DT_CS_INTERFACE) goto next_desc; /* use bDescriptorSubType, and just verify that we get a * "BLAN" (or "SAFE") descriptor. */ switch (buf [2]) { case USB_CDC_MDLM_TYPE: if (desc) { dev_dbg(&intf->dev, "extra MDLM\n"); goto bad_desc; } desc = (void *) buf; if (desc->bLength != sizeof *desc) { dev_dbg(&intf->dev, "MDLM len %u\n", desc->bLength); goto bad_desc; } /* expect bcdVersion 1.0, ignore */ if (memcmp(&desc->bGUID, blan_guid, 16) && memcmp(&desc->bGUID, safe_guid, 16)) { /* hey, this one might _really_ be MDLM! 
*/ dev_dbg(&intf->dev, "MDLM guid\n"); goto bad_desc; } break; case USB_CDC_MDLM_DETAIL_TYPE: if (detail) { dev_dbg(&intf->dev, "extra MDLM detail\n"); goto bad_desc; } detail = (void *) buf; switch (detail->bGuidDescriptorType) { case 0: /* "SAFE" */ if (detail->bLength != (sizeof *detail + 2)) goto bad_detail; break; case 1: /* "BLAN" */ if (detail->bLength != (sizeof *detail + 3)) goto bad_detail; break; default: goto bad_detail; } /* assuming we either noticed BLAN already, or will * find it soon, there are some data bytes here: * - bmNetworkCapabilities (unused) * - bmDataCapabilities (bits, see below) * - bPad (ignored, for PADAFTER -- BLAN-only) * bits are: * - 0x01 -- Zaurus framing (add CRC) * - 0x02 -- PADBEFORE (CRC includes some padding) * - 0x04 -- PADAFTER (some padding after CRC) * - 0x08 -- "fermat" packet mangling (for hw bugs) * the PADBEFORE appears not to matter; we interop * with devices that use it and those that don't. */ if ((detail->bDetailData[1] & ~0x02) != 0x01) { /* bmDataCapabilities == 0 would be fine too, * but framing is minidriver-coupled for now. */ bad_detail: dev_dbg(&intf->dev, "bad MDLM detail, %d %d %d\n", detail->bLength, detail->bDetailData[0], detail->bDetailData[2]); goto bad_desc; } /* same extra framing as for non-BLAN mode */ dev->net->hard_header_len += 6; dev->rx_urb_size = dev->net->hard_header_len + dev->net->mtu; break; } next_desc: len -= buf [0]; /* bLength */ buf += buf [0]; } if (!desc || !detail) { dev_dbg(&intf->dev, "missing cdc mdlm %s%sdescriptor\n", desc ? "" : "func ", detail ? "" : "detail "); goto bad_desc; } /* There's probably a CDC Ethernet descriptor there, but we can't * rely on the Ethernet address it provides since not all vendors * bother to make it unique. Likewise there's no point in tracking * of the CDC event notifications. */ return usbnet_get_endpoints(dev, intf); bad_desc: dev_info(&dev->udev->dev, "unsupported MDLM descriptors\n"); return -ENODEV; } static const struct driver_info bogus_mdlm_info = { .description = "pseudo-MDLM (BLAN) device", .flags = FLAG_POINTTOPOINT | FLAG_FRAMING_Z, .check_connect = always_connected, .tx_fixup = zaurus_tx_fixup, .bind = blan_mdlm_bind, }; static const struct usb_device_id products [] = { #define ZAURUS_MASTER_INTERFACE \ .bInterfaceClass = USB_CLASS_COMM, \ .bInterfaceSubClass = USB_CDC_SUBCLASS_ETHERNET, \ .bInterfaceProtocol = USB_CDC_PROTO_NONE #define ZAURUS_FAKE_INTERFACE \ .bInterfaceClass = USB_CLASS_COMM, \ .bInterfaceSubClass = USB_CDC_SUBCLASS_MDLM, \ .bInterfaceProtocol = USB_CDC_PROTO_NONE /* SA-1100 based Sharp Zaurus ("collie"), or compatible. */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8004, ZAURUS_MASTER_INTERFACE, .driver_info = ZAURUS_STRONGARM_INFO, }, /* PXA-2xx based models are also lying-about-cdc. If you add any * more devices that claim to be CDC Ethernet, make sure they get * added to the blacklist in cdc_ether too. * * NOTE: OpenZaurus versions with 2.6 kernels won't use these entries, * unlike the older ones with 2.4 "embedix" kernels. 
*/ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8005, /* A-300 */ ZAURUS_MASTER_INTERFACE, .driver_info = ZAURUS_PXA_INFO, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8005, /* A-300 */ ZAURUS_FAKE_INTERFACE, .driver_info = (unsigned long)&bogus_mdlm_info, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8006, /* B-500/SL-5600 */ ZAURUS_MASTER_INTERFACE, .driver_info = ZAURUS_PXA_INFO, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8006, /* B-500/SL-5600 */ ZAURUS_FAKE_INTERFACE, .driver_info = (unsigned long)&bogus_mdlm_info, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8007, /* C-700 */ ZAURUS_MASTER_INTERFACE, .driver_info = ZAURUS_PXA_INFO, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x8007, /* C-700 */ ZAURUS_FAKE_INTERFACE, .driver_info = (unsigned long)&bogus_mdlm_info, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x9031, /* C-750 C-760 */ ZAURUS_MASTER_INTERFACE, .driver_info = ZAURUS_PXA_INFO, }, { /* C-750/C-760/C-860/SL-C3000 PDA in MDLM mode */ USB_DEVICE_AND_INTERFACE_INFO(0x04DD, 0x9031, USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long) &bogus_mdlm_info, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x9032, /* SL-6000 */ ZAURUS_MASTER_INTERFACE, .driver_info = ZAURUS_PXA_INFO, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, .idProduct = 0x9032, /* SL-6000 */ ZAURUS_FAKE_INTERFACE, .driver_info = (unsigned long)&bogus_mdlm_info, }, { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x04DD, /* reported with some C860 units */ .idProduct = 0x9050, /* C-860 */ ZAURUS_MASTER_INTERFACE, .driver_info = ZAURUS_PXA_INFO, }, { /* Motorola Rokr E6 */ USB_DEVICE_AND_INTERFACE_INFO(0x22b8, 0x6027, USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long) &bogus_mdlm_info, }, { /* Motorola MOTOMAGX phones */ USB_DEVICE_AND_INTERFACE_INFO(0x22b8, 0x6425, USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long) &bogus_mdlm_info, }, /* Olympus has some models with a Zaurus-compatible option. 
* R-1000 uses a FreeScale i.MXL cpu (ARMv4T) */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_DEVICE, .idVendor = 0x07B4, .idProduct = 0x0F02, /* R-1000 */ ZAURUS_MASTER_INTERFACE, .driver_info = OLYMPUS_MXL_INFO, }, /* Logitech Harmony 900 - uses the pseudo-MDLM (BLAN) driver */ { USB_DEVICE_AND_INTERFACE_INFO(0x046d, 0xc11f, USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long) &bogus_mdlm_info, }, { }, // END }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver zaurus_driver = { .name = "zaurus", .id_table = products, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .disable_hub_initiated_lpm = 1, }; module_usb_driver(zaurus_driver); MODULE_AUTHOR("Pavel Machek, David Brownell"); MODULE_DESCRIPTION("Sharp Zaurus PDA, and compatible products"); MODULE_LICENSE("GPL");
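/*
 * A standalone sketch of the Zaurus framing performed by zaurus_tx_fixup()
 * above: a CRC-32 over the frame (seed ~0, result complemented, same
 * reflected 0xEDB88320 polynomial as the kernel's crc32_le()) is appended
 * least-significant byte first. The bitwise CRC helper here is a local
 * re-implementation for illustration only.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32_le_bitwise(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (crc & 1 ? 0xEDB88320u : 0);
	}
	return crc;
}

int main(void)
{
	uint8_t frame[64] = "example ethernet payload";
	size_t len = strlen((char *)frame);
	uint32_t fcs = ~crc32_le_bitwise(~0u, frame, len);

	frame[len + 0] = fcs & 0xff;		/* LSB first, as in the driver */
	frame[len + 1] = (fcs >> 8) & 0xff;
	frame[len + 2] = (fcs >> 16) & 0xff;
	frame[len + 3] = (fcs >> 24) & 0xff;

	printf("fcs=0x%08x framed len=%zu\n", fcs, len + 4);
	return 0;
}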
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BKEY_BUF_H
#define _BCACHEFS_BKEY_BUF_H

#include "bcachefs.h"
#include "bkey.h"

struct bkey_buf {
	struct bkey_i	*k;
	u64		onstack[12];
};

static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
					 struct bch_fs *c, unsigned u64s)
{
	if (s->k == (void *) s->onstack &&
	    u64s > ARRAY_SIZE(s->onstack)) {
		s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
		memcpy(s->k, s->onstack, sizeof(s->onstack));
	}
}

static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
					    struct bch_fs *c,
					    struct bkey_s_c k)
{
	bch2_bkey_buf_realloc(s, c, k.k->u64s);
	bkey_reassemble(s->k, k);
}

static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
				      struct bch_fs *c,
				      struct bkey_i *src)
{
	bch2_bkey_buf_realloc(s, c, src->k.u64s);
	bkey_copy(s->k, src);
}

static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
					struct bch_fs *c,
					struct btree *b,
					struct bkey_packed *src)
{
	bch2_bkey_buf_realloc(s, c, BKEY_U64s + bkeyp_val_u64s(&b->format, src));
	bch2_bkey_unpack(b, s->k, src);
}

static inline void bch2_bkey_buf_init(struct bkey_buf *s)
{
	s->k = (void *) s->onstack;
}

static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
{
	if (s->k != (void *) s->onstack)
		mempool_free(s->k, &c->large_bkey_pool);
	s->k = NULL;
}

#endif /* _BCACHEFS_BKEY_BUF_H */
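/*
 * A userspace sketch of the bkey_buf pattern above: keys are built in a
 * fixed on-stack buffer and only promoted to a heap allocation when they do
 * not fit, with the partial contents copied across. malloc() stands in for
 * mempool_alloc(&c->large_bkey_pool, ...).
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct sketch_buf {
	uint64_t *k;
	uint64_t onstack[12];
};

static void sketch_buf_init(struct sketch_buf *s)
{
	s->k = s->onstack;
}

static int sketch_buf_realloc(struct sketch_buf *s, size_t u64s)
{
	if (s->k == s->onstack &&
	    u64s > sizeof(s->onstack) / sizeof(s->onstack[0])) {
		uint64_t *n = malloc(u64s * sizeof(*n));

		if (!n)
			return -1;
		memcpy(n, s->onstack, sizeof(s->onstack));
		s->k = n;
	}
	return 0;
}

static void sketch_buf_exit(struct sketch_buf *s)
{
	if (s->k != s->onstack)
		free(s->k);
	s->k = NULL;
}

int main(void)
{
	struct sketch_buf b;

	sketch_buf_init(&b);
	sketch_buf_realloc(&b, 8);	/* fits: still on the stack */
	printf("on stack: %d\n", b.k == b.onstack);
	sketch_buf_realloc(&b, 32);	/* too big: moved to the heap */
	printf("on stack: %d\n", b.k == b.onstack);
	sketch_buf_exit(&b);
	return 0;
}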
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * printk_safe.c - Safe printk for printk-deadlock-prone contexts
 */

#include <linux/preempt.h>
#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/kprobes.h>

#include "internal.h"

/* Context where printk messages are never suppressed */
static atomic_t force_con;

void printk_force_console_enter(void)
{
	atomic_inc(&force_con);
}

void printk_force_console_exit(void)
{
	atomic_dec(&force_con);
}

bool is_printk_force_console(void)
{
	return atomic_read(&force_con);
}

static DEFINE_PER_CPU(int, printk_context);

/* Can be preempted by NMI. */
void __printk_safe_enter(void)
{
	this_cpu_inc(printk_context);
}

/* Can be preempted by NMI. */
void __printk_safe_exit(void)
{
	this_cpu_dec(printk_context);
}

void __printk_deferred_enter(void)
{
	cant_migrate();
	__printk_safe_enter();
}

void __printk_deferred_exit(void)
{
	cant_migrate();
	__printk_safe_exit();
}

bool is_printk_legacy_deferred(void)
{
	/*
	 * The per-CPU variable @printk_context can be read safely in any
	 * context. CPU migration is always disabled when set.
	 */
	return (force_legacy_kthread() ||
		this_cpu_read(printk_context) ||
		in_nmi());
}

asmlinkage int vprintk(const char *fmt, va_list args)
{
#ifdef CONFIG_KGDB_KDB
	/* Allow to pass printk() to kdb but avoid a recursion. */
	if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
		return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
#endif

	/*
	 * Use the main logbuf even in NMI. But avoid calling console
	 * drivers that might have their own locks.
	 */
	if (is_printk_legacy_deferred())
		return vprintk_deferred(fmt, args);

	/* No obstacles. */
	return vprintk_default(fmt, args);
}
EXPORT_SYMBOL(vprintk);
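/*
 * A userspace sketch of the nesting counter above: __printk_safe_enter() /
 * __printk_safe_exit() bump a per-CPU count, and is_printk_legacy_deferred()
 * treats any non-zero count as "defer console output". A thread-local int
 * stands in for the per-CPU variable, and the NMI/kthread checks are left
 * out.
 */
#include <stdio.h>
#include <stdbool.h>

static _Thread_local int printk_context;

static void safe_enter(void) { printk_context++; }
static void safe_exit(void)  { printk_context--; }

static bool legacy_deferred(void)
{
	return printk_context != 0;
}

int main(void)
{
	printf("deferred=%d\n", legacy_deferred());	/* 0 */
	safe_enter();
	safe_enter();					/* nesting is fine */
	printf("deferred=%d\n", legacy_deferred());	/* 1 */
	safe_exit();
	safe_exit();
	printf("deferred=%d\n", legacy_deferred());	/* 0 */
	return 0;
}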
1 1 8 11 23 23 1 1 1 1 1 1 1 3 1 1 1 1 22 23 3 5 5 5 5 5 22 23 23 23 23 22 21 4 4 1 1 1 1 1 1 9 9 9 9 9 1 1 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 // SPDX-License-Identifier: LGPL-2.1+ /* Copyright (C) 2022 Kent Overstreet */ #include <linux/bitmap.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string_helpers.h> #include "printbuf.h" static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos) { return pos - buf->last_newline; } static inline unsigned printbuf_linelen(struct printbuf *buf) { return __printbuf_linelen(buf, buf->pos); } /* * Returns spaces from start of line, if set, or 0 if unset: */ static inline unsigned cur_tabstop(struct printbuf *buf) { return buf->cur_tabstop < buf->nr_tabstops ? buf->_tabstops[buf->cur_tabstop] : 0; } int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) { /* Reserved space for terminating nul: */ extra += 1; if (out->pos + extra <= out->size) return 0; if (!out->heap_allocated) { out->overflow = true; return 0; } unsigned new_size = roundup_pow_of_two(out->size + extra); /* Sanity check... */ if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) { out->allocation_failure = true; out->overflow = true; return -ENOMEM; } /* * Note: output buffer must be freeable with kfree(), it's not required * that the user use printbuf_exit(). */ char *buf = krealloc(out->buf, new_size, !out->atomic ? 
GFP_KERNEL : GFP_NOWAIT); if (!buf) { out->allocation_failure = true; out->overflow = true; return -ENOMEM; } out->buf = buf; out->size = new_size; return 0; } static void printbuf_advance_pos(struct printbuf *out, unsigned len) { out->pos += min(len, printbuf_remaining(out)); } static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr) { unsigned move = out->pos - pos; bch2_printbuf_make_room(out, nr); if (pos + nr < out->size) memmove(out->buf + pos + nr, out->buf + pos, min(move, out->size - 1 - pos - nr)); if (pos < out->size) memset(out->buf + pos, ' ', min(nr, out->size - pos)); printbuf_advance_pos(out, nr); printbuf_nul_terminate_reserved(out); } static void __printbuf_do_indent(struct printbuf *out, unsigned pos) { while (true) { int pad; unsigned len = out->pos - pos; char *p = out->buf + pos; char *n = memscan(p, '\n', len); if (cur_tabstop(out)) { n = min(n, (char *) memscan(p, '\r', len)); n = min(n, (char *) memscan(p, '\t', len)); } pos = n - out->buf; if (pos == out->pos) break; switch (*n) { case '\n': pos++; out->last_newline = pos; printbuf_insert_spaces(out, pos, out->indent); pos = min(pos + out->indent, out->pos); out->last_field = pos; out->cur_tabstop = 0; break; case '\r': memmove(n, n + 1, out->pos - pos); --out->pos; pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos); if (pad > 0) { printbuf_insert_spaces(out, out->last_field, pad); pos += pad; } out->last_field = pos; out->cur_tabstop++; break; case '\t': pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1; if (pad > 0) { *n = ' '; printbuf_insert_spaces(out, pos, pad - 1); pos += pad; } else { memmove(n, n + 1, out->pos - pos); --out->pos; } out->last_field = pos; out->cur_tabstop++; break; } } } static inline void printbuf_do_indent(struct printbuf *out, unsigned pos) { if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling) __printbuf_do_indent(out, pos); } void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) { int len; do { va_list args2; va_copy(args2, args); len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2); va_end(args2); } while (len > printbuf_remaining(out) && !bch2_printbuf_make_room(out, len)); unsigned indent_pos = out->pos; printbuf_advance_pos(out, len); printbuf_do_indent(out, indent_pos); } void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) { va_list args; int len; do { va_start(args, fmt); len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args); va_end(args); } while (len > printbuf_remaining(out) && !bch2_printbuf_make_room(out, len)); unsigned indent_pos = out->pos; printbuf_advance_pos(out, len); printbuf_do_indent(out, indent_pos); } /** * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be * null terminated * @buf: printbuf to terminate * Returns: Printbuf contents, as a nul terminated C string */ const char *bch2_printbuf_str(const struct printbuf *buf) { /* * If we've written to a printbuf then it's guaranteed to be a null * terminated string - but if we haven't, then we might not have * allocated a buffer at all: */ return buf->pos ? buf->buf : ""; } /** * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it * against accidental use. 
* @buf: printbuf to exit */ void bch2_printbuf_exit(struct printbuf *buf) { if (buf->heap_allocated) { kfree(buf->buf); buf->buf = ERR_PTR(-EINTR); /* poison value */ } } void bch2_printbuf_tabstops_reset(struct printbuf *buf) { buf->nr_tabstops = 0; } void bch2_printbuf_tabstop_pop(struct printbuf *buf) { if (buf->nr_tabstops) --buf->nr_tabstops; } /* * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop * * @buf: printbuf to control * @spaces: number of spaces from previous tabpstop * * In the future this function may allocate memory if setting more than * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start * of line. */ int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) { unsigned prev_tabstop = buf->nr_tabstops ? buf->_tabstops[buf->nr_tabstops - 1] : 0; if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops))) return -EINVAL; buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces; buf->has_indent_or_tabstops = true; return 0; } /** * bch2_printbuf_indent_add() - add to the current indent level * * @buf: printbuf to control * @spaces: number of spaces to add to the current indent level * * Subsequent lines, and the current line if the output position is at the start * of the current line, will be indented by @spaces more spaces. */ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) { if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) spaces = 0; buf->indent += spaces; prt_chars(buf, ' ', spaces); buf->has_indent_or_tabstops = true; } /** * bch2_printbuf_indent_sub() - subtract from the current indent level * * @buf: printbuf to control * @spaces: number of spaces to subtract from the current indent level * * Subsequent lines, and the current line if the output position is at the start * of the current line, will be indented by @spaces less spaces. */ void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) { if (WARN_ON_ONCE(spaces > buf->indent)) spaces = buf->indent; if (buf->last_newline + buf->indent == buf->pos) { buf->pos -= spaces; printbuf_nul_terminate(buf); } buf->indent -= spaces; if (!buf->indent && !buf->nr_tabstops) buf->has_indent_or_tabstops = false; } void bch2_prt_newline(struct printbuf *buf) { bch2_printbuf_make_room(buf, 1 + buf->indent); __prt_char_reserved(buf, '\n'); buf->last_newline = buf->pos; __prt_chars_reserved(buf, ' ', buf->indent); printbuf_nul_terminate_reserved(buf); buf->last_field = buf->pos; buf->cur_tabstop = 0; } void bch2_printbuf_strip_trailing_newline(struct printbuf *out) { for (int p = out->pos - 1; p >= 0; --p) { if (out->buf[p] == '\n') { out->pos = p; break; } if (out->buf[p] != ' ') break; } printbuf_nul_terminate_reserved(out); } static void __prt_tab(struct printbuf *out) { int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); prt_chars(out, ' ', spaces); out->last_field = out->pos; out->cur_tabstop++; } /** * bch2_prt_tab() - Advance printbuf to the next tabstop * @out: printbuf to control * * Advance output to the next tabstop by printing spaces. 
*/ void bch2_prt_tab(struct printbuf *out) { if (WARN_ON(!cur_tabstop(out))) return; __prt_tab(out); } static void __prt_tab_rjust(struct printbuf *buf) { int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); if (pad > 0) printbuf_insert_spaces(buf, buf->last_field, pad); buf->last_field = buf->pos; buf->cur_tabstop++; } /** * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying * previous output * * @buf: printbuf to control * * Advance output to the next tabstop by inserting spaces immediately after the * previous tabstop, right justifying previously outputted text. */ void bch2_prt_tab_rjust(struct printbuf *buf) { if (WARN_ON(!cur_tabstop(buf))) return; __prt_tab_rjust(buf); } /** * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters * * @out: output printbuf * @str: string to print * @count: number of bytes to print * * The following contol characters are handled as so: * \n: prt_newline newline that obeys current indent level * \t: prt_tab advance to next tabstop * \r: prt_tab_rjust advance to next tabstop, with right justification */ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) { unsigned indent_pos = out->pos; prt_bytes(out, str, count); printbuf_do_indent(out, indent_pos); } /** * bch2_prt_human_readable_u64() - Print out a u64 in human readable units * @out: output printbuf * @v: integer to print * * Units of 2^10 (default) or 10^3 are controlled via @out->si_units */ void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) { bch2_printbuf_make_room(out, 10); unsigned len = string_get_size(v, 1, !out->si_units, out->buf + out->pos, printbuf_remaining_size(out)); printbuf_advance_pos(out, len); } /** * bch2_prt_human_readable_s64() - Print out a s64 in human readable units * @out: output printbuf * @v: integer to print * * Units of 2^10 (default) or 10^3 are controlled via @out->si_units */ void bch2_prt_human_readable_s64(struct printbuf *out, s64 v) { if (v < 0) prt_char(out, '-'); bch2_prt_human_readable_u64(out, abs(v)); } /** * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options * @out: output printbuf * @v: integer to print * * Units are either raw (default), or human reabable units (controlled via * @buf->human_readable_units) */ void bch2_prt_units_u64(struct printbuf *out, u64 v) { if (out->human_readable_units) bch2_prt_human_readable_u64(out, v); else bch2_prt_printf(out, "%llu", v); } /** * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options * @out: output printbuf * @v: integer to print * * Units are either raw (default), or human reabable units (controlled via * @buf->human_readable_units) */ void bch2_prt_units_s64(struct printbuf *out, s64 v) { if (v < 0) prt_char(out, '-'); bch2_prt_units_u64(out, abs(v)); } void bch2_prt_string_option(struct printbuf *out, const char * const list[], size_t selected) { for (size_t i = 0; list[i]; i++) bch2_prt_printf(out, i == selected ? 
"[%s] " : "%s ", list[i]); } void bch2_prt_bitflags(struct printbuf *out, const char * const list[], u64 flags) { unsigned bit, nr = 0; bool first = true; while (list[nr]) nr++; while (flags && (bit = __ffs64(flags)) < nr) { if (!first) bch2_prt_printf(out, ","); first = false; bch2_prt_printf(out, "%s", list[bit]); flags ^= BIT_ULL(bit); } } void bch2_prt_bitflags_vector(struct printbuf *out, const char * const list[], unsigned long *v, unsigned nr) { bool first = true; unsigned i; for (i = 0; i < nr; i++) if (!list[i]) { nr = i - 1; break; } for_each_set_bit(i, v, nr) { if (!first) bch2_prt_printf(out, ","); first = false; bch2_prt_printf(out, "%s", list[i]); } }
915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 // SPDX-License-Identifier: GPL-2.0+ /* * Infinity Unlimited USB Phoenix driver * * Copyright (C) 2010 James Courtier-Dutton (James@superbug.co.uk) * Copyright (C) 2007 Alain Degreffe (eczema@ecze.com) * * Original code taken from iuutool (Copyright (C) 2006 Juan Carlos Borrás) * * And tested with help of WB Electronics */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include "iuu_phoenix.h" #include <linux/random.h> #define DRIVER_DESC "Infinity USB Unlimited Phoenix driver" static const struct usb_device_id id_table[] = { {USB_DEVICE(IUU_USB_VENDOR_ID, IUU_USB_PRODUCT_ID)}, {} /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table); /* turbo parameter */ static int boost = 100; static int clockmode = 1; static int cdmode = 1; static int iuu_cardin; static int iuu_cardout; static bool xmas; static int vcc_default = 5; static int iuu_create_sysfs_attrs(struct usb_serial_port *port); static int iuu_remove_sysfs_attrs(struct usb_serial_port *port); static void read_rxcmd_callback(struct urb *urb); struct iuu_private { spinlock_t lock; /* store irq state */ u8 line_status; int tiostatus; /* store IUART SIGNAL for tiocmget call */ u8 reset; /* if 1 reset is needed */ int poll; /* number of poll */ u8 *writebuf; /* buffer for writing to device */ int writelen; /* num of byte to write to device */ u8 *buf; /* used for initialize speed */ u8 len; int vcc; /* vcc (either 3 or 5 V) */ u32 boost; u32 clk; }; static int iuu_port_probe(struct usb_serial_port *port) { struct iuu_private *priv; int ret; priv = kzalloc(sizeof(struct iuu_private), GFP_KERNEL); if (!priv) return -ENOMEM; priv->buf = kzalloc(256, GFP_KERNEL); if (!priv->buf) { kfree(priv); return -ENOMEM; } priv->writebuf = kzalloc(256, GFP_KERNEL); if (!priv->writebuf) { kfree(priv->buf); kfree(priv); return -ENOMEM; } priv->vcc = vcc_default; spin_lock_init(&priv->lock); 
usb_set_serial_port_data(port, priv); ret = iuu_create_sysfs_attrs(port); if (ret) { kfree(priv->writebuf); kfree(priv->buf); kfree(priv); return ret; } return 0; } static void iuu_port_remove(struct usb_serial_port *port) { struct iuu_private *priv = usb_get_serial_port_data(port); iuu_remove_sysfs_attrs(port); kfree(priv->writebuf); kfree(priv->buf); kfree(priv); } static int iuu_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; struct iuu_private *priv = usb_get_serial_port_data(port); unsigned long flags; /* FIXME: locking on tiomstatus */ dev_dbg(&port->dev, "%s msg : SET = 0x%04x, CLEAR = 0x%04x\n", __func__, set, clear); spin_lock_irqsave(&priv->lock, flags); if ((set & TIOCM_RTS) && !(priv->tiostatus == TIOCM_RTS)) { dev_dbg(&port->dev, "%s TIOCMSET RESET called !!!\n", __func__); priv->reset = 1; } if (set & TIOCM_RTS) priv->tiostatus = TIOCM_RTS; spin_unlock_irqrestore(&priv->lock, flags); return 0; } /* This is used to provide a carrier detect mechanism * When a card is present, the response is 0x00 * When no card , the reader respond with TIOCM_CD * This is known as CD autodetect mechanism */ static int iuu_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct iuu_private *priv = usb_get_serial_port_data(port); unsigned long flags; int rc; spin_lock_irqsave(&priv->lock, flags); rc = priv->tiostatus; spin_unlock_irqrestore(&priv->lock, flags); return rc; } static void iuu_rxcmd(struct urb *urb) { struct usb_serial_port *port = urb->context; int status = urb->status; if (status) { dev_dbg(&port->dev, "%s - status = %d\n", __func__, status); /* error stop all */ return; } memset(port->write_urb->transfer_buffer, IUU_UART_RX, 1); usb_fill_bulk_urb(port->write_urb, port->serial->dev, usb_sndbulkpipe(port->serial->dev, port->bulk_out_endpointAddress), port->write_urb->transfer_buffer, 1, read_rxcmd_callback, port); usb_submit_urb(port->write_urb, GFP_ATOMIC); } static int iuu_reset(struct usb_serial_port *port, u8 wt) { struct iuu_private *priv = usb_get_serial_port_data(port); int result; char *buf_ptr = port->write_urb->transfer_buffer; /* Prepare the reset sequence */ *buf_ptr++ = IUU_RST_SET; *buf_ptr++ = IUU_DELAY_MS; *buf_ptr++ = wt; *buf_ptr = IUU_RST_CLEAR; /* send the sequence */ usb_fill_bulk_urb(port->write_urb, port->serial->dev, usb_sndbulkpipe(port->serial->dev, port->bulk_out_endpointAddress), port->write_urb->transfer_buffer, 4, iuu_rxcmd, port); result = usb_submit_urb(port->write_urb, GFP_ATOMIC); priv->reset = 0; return result; } /* Status Function * Return value is * 0x00 = no card * 0x01 = smartcard * 0x02 = sim card */ static void iuu_update_status_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct iuu_private *priv = usb_get_serial_port_data(port); u8 *st; int status = urb->status; if (status) { dev_dbg(&port->dev, "%s - status = %d\n", __func__, status); /* error stop all */ return; } st = urb->transfer_buffer; dev_dbg(&port->dev, "%s - enter\n", __func__); if (urb->actual_length == 1) { switch (st[0]) { case 0x1: priv->tiostatus = iuu_cardout; break; case 0x0: priv->tiostatus = iuu_cardin; break; default: priv->tiostatus = iuu_cardin; } } iuu_rxcmd(urb); } static void iuu_status_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; int status = urb->status; dev_dbg(&port->dev, "%s - status = %d\n", __func__, status); usb_fill_bulk_urb(port->read_urb, port->serial->dev, usb_rcvbulkpipe(port->serial->dev, 
port->bulk_in_endpointAddress), port->read_urb->transfer_buffer, 256, iuu_update_status_callback, port); usb_submit_urb(port->read_urb, GFP_ATOMIC); } static int iuu_status(struct usb_serial_port *port) { int result; memset(port->write_urb->transfer_buffer, IUU_GET_STATE_REGISTER, 1); usb_fill_bulk_urb(port->write_urb, port->serial->dev, usb_sndbulkpipe(port->serial->dev, port->bulk_out_endpointAddress), port->write_urb->transfer_buffer, 1, iuu_status_callback, port); result = usb_submit_urb(port->write_urb, GFP_ATOMIC); return result; } static int bulk_immediate(struct usb_serial_port *port, u8 *buf, u8 count) { int status; struct usb_serial *serial = port->serial; int actual = 0; /* send the data out the bulk port */ status = usb_bulk_msg(serial->dev, usb_sndbulkpipe(serial->dev, port->bulk_out_endpointAddress), buf, count, &actual, 1000); if (status != IUU_OPERATION_OK) dev_dbg(&port->dev, "%s - error = %2x\n", __func__, status); else dev_dbg(&port->dev, "%s - write OK !\n", __func__); return status; } static int read_immediate(struct usb_serial_port *port, u8 *buf, u8 count) { int status; struct usb_serial *serial = port->serial; int actual = 0; /* send the data out the bulk port */ status = usb_bulk_msg(serial->dev, usb_rcvbulkpipe(serial->dev, port->bulk_in_endpointAddress), buf, count, &actual, 1000); if (status != IUU_OPERATION_OK) dev_dbg(&port->dev, "%s - error = %2x\n", __func__, status); else dev_dbg(&port->dev, "%s - read OK !\n", __func__); return status; } static int iuu_led(struct usb_serial_port *port, unsigned int R, unsigned int G, unsigned int B, u8 f) { int status; u8 *buf; buf = kmalloc(8, GFP_KERNEL); if (!buf) return -ENOMEM; buf[0] = IUU_SET_LED; buf[1] = R & 0xFF; buf[2] = (R >> 8) & 0xFF; buf[3] = G & 0xFF; buf[4] = (G >> 8) & 0xFF; buf[5] = B & 0xFF; buf[6] = (B >> 8) & 0xFF; buf[7] = f; status = bulk_immediate(port, buf, 8); kfree(buf); if (status != IUU_OPERATION_OK) dev_dbg(&port->dev, "%s - led error status = %2x\n", __func__, status); else dev_dbg(&port->dev, "%s - led OK !\n", __func__); return IUU_OPERATION_OK; } static void iuu_rgbf_fill_buffer(u8 *buf, u8 r1, u8 r2, u8 g1, u8 g2, u8 b1, u8 b2, u8 freq) { *buf++ = IUU_SET_LED; *buf++ = r1; *buf++ = r2; *buf++ = g1; *buf++ = g2; *buf++ = b1; *buf++ = b2; *buf = freq; } static void iuu_led_activity_on(struct urb *urb) { struct usb_serial_port *port = urb->context; char *buf_ptr = port->write_urb->transfer_buffer; if (xmas) { buf_ptr[0] = IUU_SET_LED; get_random_bytes(buf_ptr + 1, 6); buf_ptr[7] = 1; } else { iuu_rgbf_fill_buffer(buf_ptr, 255, 255, 0, 0, 0, 0, 255); } usb_fill_bulk_urb(port->write_urb, port->serial->dev, usb_sndbulkpipe(port->serial->dev, port->bulk_out_endpointAddress), port->write_urb->transfer_buffer, 8 , iuu_rxcmd, port); usb_submit_urb(port->write_urb, GFP_ATOMIC); } static void iuu_led_activity_off(struct urb *urb) { struct usb_serial_port *port = urb->context; char *buf_ptr = port->write_urb->transfer_buffer; if (xmas) { iuu_rxcmd(urb); return; } iuu_rgbf_fill_buffer(buf_ptr, 0, 0, 255, 255, 0, 0, 255); usb_fill_bulk_urb(port->write_urb, port->serial->dev, usb_sndbulkpipe(port->serial->dev, port->bulk_out_endpointAddress), port->write_urb->transfer_buffer, 8 , iuu_rxcmd, port); usb_submit_urb(port->write_urb, GFP_ATOMIC); } static int iuu_clk(struct usb_serial_port *port, int dwFrq) { int status; struct iuu_private *priv = usb_get_serial_port_data(port); int Count = 0; u8 FrqGenAdr = 0x69; u8 DIV = 0; /* 8bit */ u8 XDRV = 0; /* 8bit */ u8 PUMP = 0; /* 3bit */ u8 PBmsb = 0; /* 2bit */ 
u8 PBlsb = 0; /* 8bit */ u8 PO = 0; /* 1bit */ u8 Q = 0; /* 7bit */ /* 24bit = 3bytes */ unsigned int P = 0; unsigned int P2 = 0; int frq = (int)dwFrq; if (frq == 0) { priv->buf[Count++] = IUU_UART_WRITE_I2C; priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x09; priv->buf[Count++] = 0x00; status = bulk_immediate(port, (u8 *) priv->buf, Count); if (status != 0) { dev_dbg(&port->dev, "%s - write error\n", __func__); return status; } } else if (frq == 3579000) { DIV = 100; P = 1193; Q = 40; XDRV = 0; } else if (frq == 3680000) { DIV = 105; P = 161; Q = 5; XDRV = 0; } else if (frq == 6000000) { DIV = 66; P = 66; Q = 2; XDRV = 0x28; } else { unsigned int result = 0; unsigned int tmp = 0; unsigned int check; unsigned int check2; char found = 0x00; unsigned int lQ = 2; unsigned int lP = 2055; unsigned int lDiv = 4; for (lQ = 2; lQ <= 47 && !found; lQ++) for (lP = 2055; lP >= 8 && !found; lP--) for (lDiv = 4; lDiv <= 127 && !found; lDiv++) { tmp = (12000000 / lDiv) * (lP / lQ); if (abs((int)(tmp - frq)) < abs((int)(frq - result))) { check2 = (12000000 / lQ); if (check2 < 250000) continue; check = (12000000 / lQ) * lP; if (check > 400000000) continue; if (check < 100000000) continue; if (lDiv < 4 || lDiv > 127) continue; result = tmp; P = lP; DIV = lDiv; Q = lQ; if (result == frq) found = 0x01; } } } P2 = ((P - PO) / 2) - 4; PUMP = 0x04; PBmsb = (P2 >> 8 & 0x03); PBlsb = P2 & 0xFF; PO = (P >> 10) & 0x01; Q = Q - 2; priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x09; priv->buf[Count++] = 0x20; /* Adr = 0x09 */ priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x0C; priv->buf[Count++] = DIV; /* Adr = 0x0C */ priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x12; priv->buf[Count++] = XDRV; /* Adr = 0x12 */ priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x13; priv->buf[Count++] = 0x6B; /* Adr = 0x13 */ priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x40; priv->buf[Count++] = (0xC0 | ((PUMP & 0x07) << 2)) | (PBmsb & 0x03); /* Adr = 0x40 */ priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x41; priv->buf[Count++] = PBlsb; /* Adr = 0x41 */ priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x42; priv->buf[Count++] = Q | (((PO & 0x01) << 7)); /* Adr = 0x42 */ priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x44; priv->buf[Count++] = (char)0xFF; /* Adr = 0x44 */ priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x45; priv->buf[Count++] = (char)0xFE; /* Adr = 0x45 */ priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x46; priv->buf[Count++] = 0x7F; /* Adr = 0x46 */ priv->buf[Count++] = IUU_UART_WRITE_I2C; /* 0x4C */ priv->buf[Count++] = FrqGenAdr << 1; priv->buf[Count++] = 0x47; priv->buf[Count++] = (char)0x84; /* Adr = 0x47 */ status = bulk_immediate(port, (u8 *) priv->buf, Count); if (status != IUU_OPERATION_OK) dev_dbg(&port->dev, "%s - write error\n", __func__); return status; } static int iuu_uart_flush(struct usb_serial_port *port) { struct device *dev = &port->dev; int i; int status; u8 
*rxcmd; struct iuu_private *priv = usb_get_serial_port_data(port); if (iuu_led(port, 0xF000, 0, 0, 0xFF) < 0) return -EIO; rxcmd = kmalloc(1, GFP_KERNEL); if (!rxcmd) return -ENOMEM; rxcmd[0] = IUU_UART_RX; for (i = 0; i < 2; i++) { status = bulk_immediate(port, rxcmd, 1); if (status != IUU_OPERATION_OK) { dev_dbg(dev, "%s - uart_flush_write error\n", __func__); goto out_free; } status = read_immediate(port, &priv->len, 1); if (status != IUU_OPERATION_OK) { dev_dbg(dev, "%s - uart_flush_read error\n", __func__); goto out_free; } if (priv->len > 0) { dev_dbg(dev, "%s - uart_flush datalen is : %i\n", __func__, priv->len); status = read_immediate(port, priv->buf, priv->len); if (status != IUU_OPERATION_OK) { dev_dbg(dev, "%s - uart_flush_read error\n", __func__); goto out_free; } } } dev_dbg(dev, "%s - uart_flush_read OK!\n", __func__); iuu_led(port, 0, 0xF000, 0, 0xFF); out_free: kfree(rxcmd); return status; } static void read_buf_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; unsigned char *data = urb->transfer_buffer; int status = urb->status; if (status) { if (status == -EPROTO) { /* reschedule needed */ } return; } dev_dbg(&port->dev, "%s - %i chars to write\n", __func__, urb->actual_length); if (urb->actual_length) { tty_insert_flip_string(&port->port, data, urb->actual_length); tty_flip_buffer_push(&port->port); } iuu_led_activity_on(urb); } static int iuu_bulk_write(struct usb_serial_port *port) { struct iuu_private *priv = usb_get_serial_port_data(port); unsigned long flags; int result; int buf_len; char *buf_ptr = port->write_urb->transfer_buffer; spin_lock_irqsave(&priv->lock, flags); *buf_ptr++ = IUU_UART_ESC; *buf_ptr++ = IUU_UART_TX; *buf_ptr++ = priv->writelen; memcpy(buf_ptr, priv->writebuf, priv->writelen); buf_len = priv->writelen; priv->writelen = 0; spin_unlock_irqrestore(&priv->lock, flags); dev_dbg(&port->dev, "%s - writing %i chars : %*ph\n", __func__, buf_len, buf_len, buf_ptr); usb_fill_bulk_urb(port->write_urb, port->serial->dev, usb_sndbulkpipe(port->serial->dev, port->bulk_out_endpointAddress), port->write_urb->transfer_buffer, buf_len + 3, iuu_rxcmd, port); result = usb_submit_urb(port->write_urb, GFP_ATOMIC); usb_serial_port_softint(port); return result; } static int iuu_read_buf(struct usb_serial_port *port, int len) { int result; usb_fill_bulk_urb(port->read_urb, port->serial->dev, usb_rcvbulkpipe(port->serial->dev, port->bulk_in_endpointAddress), port->read_urb->transfer_buffer, len, read_buf_callback, port); result = usb_submit_urb(port->read_urb, GFP_ATOMIC); return result; } static void iuu_uart_read_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct iuu_private *priv = usb_get_serial_port_data(port); unsigned long flags; int status = urb->status; int len = 0; unsigned char *data = urb->transfer_buffer; priv->poll++; if (status) { dev_dbg(&port->dev, "%s - status = %d\n", __func__, status); /* error stop all */ return; } if (urb->actual_length == 1) len = (int) data[0]; if (urb->actual_length > 1) { dev_dbg(&port->dev, "%s - urb->actual_length = %i\n", __func__, urb->actual_length); return; } /* if len > 0 call readbuf */ if (len > 0) { dev_dbg(&port->dev, "%s - call read buf - len to read is %i\n", __func__, len); status = iuu_read_buf(port, len); return; } /* need to update status ? */ if (priv->poll > 99) { status = iuu_status(port); priv->poll = 0; return; } /* reset waiting ? 
*/ if (priv->reset == 1) { status = iuu_reset(port, 0xC); return; } /* Writebuf is waiting */ spin_lock_irqsave(&priv->lock, flags); if (priv->writelen > 0) { spin_unlock_irqrestore(&priv->lock, flags); status = iuu_bulk_write(port); return; } spin_unlock_irqrestore(&priv->lock, flags); /* if nothing to write call again rxcmd */ dev_dbg(&port->dev, "%s - rxcmd recall\n", __func__); iuu_led_activity_off(urb); } static int iuu_uart_write(struct tty_struct *tty, struct usb_serial_port *port, const u8 *buf, int count) { struct iuu_private *priv = usb_get_serial_port_data(port); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); count = min(count, 256 - priv->writelen); if (count == 0) goto out; /* fill the buffer */ memcpy(priv->writebuf + priv->writelen, buf, count); priv->writelen += count; out: spin_unlock_irqrestore(&priv->lock, flags); return count; } static void read_rxcmd_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; int result; int status = urb->status; if (status) { /* error stop all */ return; } usb_fill_bulk_urb(port->read_urb, port->serial->dev, usb_rcvbulkpipe(port->serial->dev, port->bulk_in_endpointAddress), port->read_urb->transfer_buffer, 256, iuu_uart_read_callback, port); result = usb_submit_urb(port->read_urb, GFP_ATOMIC); dev_dbg(&port->dev, "%s - submit result = %d\n", __func__, result); } static int iuu_uart_on(struct usb_serial_port *port) { int status; u8 *buf; buf = kmalloc(4, GFP_KERNEL); if (!buf) return -ENOMEM; buf[0] = IUU_UART_ENABLE; buf[1] = (u8) ((IUU_BAUD_9600 >> 8) & 0x00FF); buf[2] = (u8) (0x00FF & IUU_BAUD_9600); buf[3] = (u8) (0x0F0 & IUU_ONE_STOP_BIT) | (0x07 & IUU_PARITY_EVEN); status = bulk_immediate(port, buf, 4); if (status != IUU_OPERATION_OK) { dev_dbg(&port->dev, "%s - uart_on error\n", __func__); goto uart_enable_failed; } /* iuu_reset() the card after iuu_uart_on() */ status = iuu_uart_flush(port); if (status != IUU_OPERATION_OK) dev_dbg(&port->dev, "%s - uart_flush error\n", __func__); uart_enable_failed: kfree(buf); return status; } /* Disables the IUU UART (a.k.a. 
the Phoenix voiderface) */ static int iuu_uart_off(struct usb_serial_port *port) { int status; u8 *buf; buf = kmalloc(1, GFP_KERNEL); if (!buf) return -ENOMEM; buf[0] = IUU_UART_DISABLE; status = bulk_immediate(port, buf, 1); if (status != IUU_OPERATION_OK) dev_dbg(&port->dev, "%s - uart_off error\n", __func__); kfree(buf); return status; } static int iuu_uart_baud(struct usb_serial_port *port, u32 baud_base, u32 *actual, u8 parity) { int status; u32 baud; u8 *dataout; u8 DataCount = 0; u8 T1Frekvens = 0; u8 T1reload = 0; unsigned int T1FrekvensHZ = 0; dev_dbg(&port->dev, "%s - enter baud_base=%d\n", __func__, baud_base); dataout = kmalloc(5, GFP_KERNEL); if (!dataout) return -ENOMEM; /*baud = (((priv->clk / 35) * baud_base) / 100000); */ baud = baud_base; if (baud < 1200 || baud > 230400) { kfree(dataout); return IUU_INVALID_PARAMETER; } if (baud > 977) { T1Frekvens = 3; T1FrekvensHZ = 500000; } if (baud > 3906) { T1Frekvens = 2; T1FrekvensHZ = 2000000; } if (baud > 11718) { T1Frekvens = 1; T1FrekvensHZ = 6000000; } if (baud > 46875) { T1Frekvens = 0; T1FrekvensHZ = 24000000; } T1reload = 256 - (u8) (T1FrekvensHZ / (baud * 2)); /* magic number here: ENTER_FIRMWARE_UPDATE; */ dataout[DataCount++] = IUU_UART_ESC; /* magic number here: CHANGE_BAUD; */ dataout[DataCount++] = IUU_UART_CHANGE; dataout[DataCount++] = T1Frekvens; dataout[DataCount++] = T1reload; *actual = (T1FrekvensHZ / (256 - T1reload)) / 2; switch (parity & 0x0F) { case IUU_PARITY_NONE: dataout[DataCount++] = 0x00; break; case IUU_PARITY_EVEN: dataout[DataCount++] = 0x01; break; case IUU_PARITY_ODD: dataout[DataCount++] = 0x02; break; case IUU_PARITY_MARK: dataout[DataCount++] = 0x03; break; case IUU_PARITY_SPACE: dataout[DataCount++] = 0x04; break; default: kfree(dataout); return IUU_INVALID_PARAMETER; } switch (parity & 0xF0) { case IUU_ONE_STOP_BIT: dataout[DataCount - 1] |= IUU_ONE_STOP_BIT; break; case IUU_TWO_STOP_BITS: dataout[DataCount - 1] |= IUU_TWO_STOP_BITS; break; default: kfree(dataout); return IUU_INVALID_PARAMETER; } status = bulk_immediate(port, dataout, DataCount); if (status != IUU_OPERATION_OK) dev_dbg(&port->dev, "%s - uart_off error\n", __func__); kfree(dataout); return status; } static void iuu_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { const u32 supported_mask = CMSPAR|PARENB|PARODD; struct iuu_private *priv = usb_get_serial_port_data(port); unsigned int cflag = tty->termios.c_cflag; int status; u32 actual; u32 parity; int csize = CS7; int baud; u32 newval = cflag & supported_mask; /* Just use the ospeed. ispeed should be the same. */ baud = tty->termios.c_ospeed; dev_dbg(&port->dev, "%s - enter c_ospeed or baud=%d\n", __func__, baud); /* compute the parity parameter */ parity = 0; if (cflag & CMSPAR) { /* Using mark space */ if (cflag & PARODD) parity |= IUU_PARITY_SPACE; else parity |= IUU_PARITY_MARK; } else if (!(cflag & PARENB)) { parity |= IUU_PARITY_NONE; csize = CS8; } else if (cflag & PARODD) parity |= IUU_PARITY_ODD; else parity |= IUU_PARITY_EVEN; parity |= (cflag & CSTOPB ? IUU_TWO_STOP_BITS : IUU_ONE_STOP_BIT); /* set it */ status = iuu_uart_baud(port, baud * priv->boost / 100, &actual, parity); /* set the termios value to the real one, so the user now what has * changed. 
We support few fields so its easies to copy the old hw * settings back over and then adjust them */ if (old_termios) tty_termios_copy_hw(&tty->termios, old_termios); if (status != 0) /* Set failed - return old bits */ return; /* Re-encode speed, parity and csize */ tty_encode_baud_rate(tty, baud, baud); tty->termios.c_cflag &= ~(supported_mask|CSIZE); tty->termios.c_cflag |= newval | csize; } static void iuu_close(struct usb_serial_port *port) { /* iuu_led (port,255,0,0,0); */ iuu_uart_off(port); usb_kill_urb(port->write_urb); usb_kill_urb(port->read_urb); iuu_led(port, 0, 0, 0xF000, 0xFF); } static void iuu_init_termios(struct tty_struct *tty) { tty->termios.c_cflag = B9600 | CS8 | CSTOPB | CREAD | PARENB | CLOCAL; tty->termios.c_ispeed = 9600; tty->termios.c_ospeed = 9600; tty->termios.c_lflag = 0; tty->termios.c_oflag = 0; tty->termios.c_iflag = 0; } static int iuu_open(struct tty_struct *tty, struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct device *dev = &port->dev; int result; int baud; u32 actual; struct iuu_private *priv = usb_get_serial_port_data(port); baud = tty->termios.c_ospeed; dev_dbg(dev, "%s - baud %d\n", __func__, baud); usb_clear_halt(serial->dev, port->write_urb->pipe); usb_clear_halt(serial->dev, port->read_urb->pipe); priv->poll = 0; #define SOUP(a, b, c, d) do { \ result = usb_control_msg(port->serial->dev, \ usb_sndctrlpipe(port->serial->dev, 0), \ b, a, c, d, NULL, 0, 1000); \ dev_dbg(dev, "0x%x:0x%x:0x%x:0x%x %d\n", a, b, c, d, result); } while (0) /* This is not UART related but IUU USB driver related or something */ /* like that. Basically no IUU will accept any commands from the USB */ /* host unless it has received the following message */ /* sprintf(buf ,"%c%c%c%c",0x03,0x02,0x02,0x0); */ SOUP(0x03, 0x02, 0x02, 0x0); iuu_led(port, 0xF000, 0xF000, 0, 0xFF); iuu_uart_on(port); if (boost < 100) boost = 100; priv->boost = boost; switch (clockmode) { case 2: /* 3.680 Mhz */ priv->clk = IUU_CLK_3680000; iuu_clk(port, IUU_CLK_3680000 * boost / 100); result = iuu_uart_baud(port, baud * boost / 100, &actual, IUU_PARITY_EVEN); break; case 3: /* 6.00 Mhz */ iuu_clk(port, IUU_CLK_6000000 * boost / 100); priv->clk = IUU_CLK_6000000; /* Ratio of 6000000 to 3500000 for baud 9600 */ result = iuu_uart_baud(port, 16457 * boost / 100, &actual, IUU_PARITY_EVEN); break; default: /* 3.579 Mhz */ iuu_clk(port, IUU_CLK_3579000 * boost / 100); priv->clk = IUU_CLK_3579000; result = iuu_uart_baud(port, baud * boost / 100, &actual, IUU_PARITY_EVEN); } /* set the cardin cardout signals */ switch (cdmode) { case 0: iuu_cardin = 0; iuu_cardout = 0; break; case 1: iuu_cardin = TIOCM_CD; iuu_cardout = 0; break; case 2: iuu_cardin = 0; iuu_cardout = TIOCM_CD; break; case 3: iuu_cardin = TIOCM_DSR; iuu_cardout = 0; break; case 4: iuu_cardin = 0; iuu_cardout = TIOCM_DSR; break; case 5: iuu_cardin = TIOCM_CTS; iuu_cardout = 0; break; case 6: iuu_cardin = 0; iuu_cardout = TIOCM_CTS; break; case 7: iuu_cardin = TIOCM_RNG; iuu_cardout = 0; break; case 8: iuu_cardin = 0; iuu_cardout = TIOCM_RNG; } iuu_uart_flush(port); dev_dbg(dev, "%s - initialization done\n", __func__); memset(port->write_urb->transfer_buffer, IUU_UART_RX, 1); usb_fill_bulk_urb(port->write_urb, port->serial->dev, usb_sndbulkpipe(port->serial->dev, port->bulk_out_endpointAddress), port->write_urb->transfer_buffer, 1, read_rxcmd_callback, port); result = usb_submit_urb(port->write_urb, GFP_KERNEL); if (result) { dev_err(dev, "%s - failed submitting read urb, error %d\n", __func__, result); 
iuu_close(port); } else { dev_dbg(dev, "%s - rxcmd OK\n", __func__); } return result; } /* how to change VCC */ static int iuu_vcc_set(struct usb_serial_port *port, unsigned int vcc) { int status; u8 *buf; buf = kmalloc(5, GFP_KERNEL); if (!buf) return -ENOMEM; buf[0] = IUU_SET_VCC; buf[1] = vcc & 0xFF; buf[2] = (vcc >> 8) & 0xFF; buf[3] = (vcc >> 16) & 0xFF; buf[4] = (vcc >> 24) & 0xFF; status = bulk_immediate(port, buf, 5); kfree(buf); if (status != IUU_OPERATION_OK) dev_dbg(&port->dev, "%s - vcc error status = %2x\n", __func__, status); else dev_dbg(&port->dev, "%s - vcc OK !\n", __func__); return status; } /* * Sysfs Attributes */ static ssize_t vcc_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_serial_port *port = to_usb_serial_port(dev); struct iuu_private *priv = usb_get_serial_port_data(port); return sprintf(buf, "%d\n", priv->vcc); } static ssize_t vcc_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_serial_port *port = to_usb_serial_port(dev); struct iuu_private *priv = usb_get_serial_port_data(port); unsigned long v; if (kstrtoul(buf, 10, &v)) { dev_err(dev, "%s - vcc_mode: %s is not a unsigned long\n", __func__, buf); goto fail_store_vcc_mode; } dev_dbg(dev, "%s: setting vcc_mode = %ld\n", __func__, v); if ((v != 3) && (v != 5)) { dev_err(dev, "%s - vcc_mode %ld is invalid\n", __func__, v); } else { iuu_vcc_set(port, v); priv->vcc = v; } fail_store_vcc_mode: return count; } static DEVICE_ATTR_RW(vcc_mode); static int iuu_create_sysfs_attrs(struct usb_serial_port *port) { return device_create_file(&port->dev, &dev_attr_vcc_mode); } static int iuu_remove_sysfs_attrs(struct usb_serial_port *port) { device_remove_file(&port->dev, &dev_attr_vcc_mode); return 0; } /* * End Sysfs Attributes */ static struct usb_serial_driver iuu_device = { .driver = { .name = "iuu_phoenix", }, .id_table = id_table, .num_ports = 1, .num_bulk_in = 1, .num_bulk_out = 1, .bulk_in_size = 512, .bulk_out_size = 512, .open = iuu_open, .close = iuu_close, .write = iuu_uart_write, .read_bulk_callback = iuu_uart_read_callback, .tiocmget = iuu_tiocmget, .tiocmset = iuu_tiocmset, .set_termios = iuu_set_termios, .init_termios = iuu_init_termios, .port_probe = iuu_port_probe, .port_remove = iuu_port_remove, }; static struct usb_serial_driver * const serial_drivers[] = { &iuu_device, NULL }; module_usb_serial_driver(serial_drivers, id_table); MODULE_AUTHOR("Alain Degreffe eczema@ecze.com"); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); module_param(xmas, bool, 0644); MODULE_PARM_DESC(xmas, "Xmas colors enabled or not"); module_param(boost, int, 0644); MODULE_PARM_DESC(boost, "Card overclock boost (in percent 100-500)"); module_param(clockmode, int, 0644); MODULE_PARM_DESC(clockmode, "Card clock mode (1=3.579 MHz, 2=3.680 MHz, " "3=6 Mhz)"); module_param(cdmode, int, 0644); MODULE_PARM_DESC(cdmode, "Card detect mode (0=none, 1=CD, 2=!CD, 3=DSR, " "4=!DSR, 5=CTS, 6=!CTS, 7=RING, 8=!RING)"); module_param(vcc_default, int, 0644); MODULE_PARM_DESC(vcc_default, "Set default VCC (either 3 for 3.3V or 5 " "for 5V). Default to 5.");
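The vcc_mode attribute registered by iuu_create_sysfs_attrs() above is the userspace-visible side of iuu_vcc_set(). A minimal userspace sketch follows; the sysfs path is an assumption (it depends on which ttyUSB port the adapter enumerates as), and the store handler only accepts the values 3 and 5.

#include <stdio.h>

int main(void)
{
	/* Illustrative path only - adjust for the actual usb-serial port. */
	const char *attr = "/sys/bus/usb-serial/devices/ttyUSB0/vcc_mode";
	FILE *f = fopen(attr, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* vcc_mode_store() accepts only 3 (3.3V) or 5 (5V). */
	if (fputs("5\n", f) == EOF)
		perror("fputs");
	fclose(f);
	return 0;
}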
509 3 511 508 418 446 1730 420 1728 165 49 112 37 573 301 409 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/atomic.h> #include <linux/inetdevice.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_nat_masquerade.h> struct masq_dev_work { struct work_struct work; struct net *net; netns_tracker ns_tracker; union nf_inet_addr addr; int ifindex; int (*iter)(struct nf_conn *i, void *data); }; #define MAX_MASQ_WORKER_COUNT 16 static DEFINE_MUTEX(masq_mutex); static unsigned int masq_refcnt __read_mostly; static atomic_t masq_worker_count __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; WARN_ON(hooknum != NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. */ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) return NF_ACCEPT; rt = skb_rtable(skb); nh = rt_nexthop(rt, ip_hdr(skb)->daddr); newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); if (!newsrc) { pr_info("%s ate my IP address\n", out->name); return NF_DROP; } nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; /* Transfer from original range. */ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; /* Hand modified range to generic setup. 
*/ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); static void iterate_cleanup_work(struct work_struct *work) { struct nf_ct_iter_data iter_data = {}; struct masq_dev_work *w; w = container_of(work, struct masq_dev_work, work); iter_data.net = w->net; iter_data.data = (void *)w; nf_ct_iterate_cleanup_net(w->iter, &iter_data); put_net_track(w->net, &w->ns_tracker); kfree(w); atomic_dec(&masq_worker_count); module_put(THIS_MODULE); } /* Iterate conntrack table in the background and remove conntrack entries * that use the device/address being removed. * * In case too many work items have been queued already or memory allocation * fails iteration is skipped, conntrack entries will time out eventually. */ static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, int ifindex, int (*iter)(struct nf_conn *i, void *data), gfp_t gfp_flags) { struct masq_dev_work *w; if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) return; net = maybe_get_net(net); if (!net) return; if (!try_module_get(THIS_MODULE)) goto err_module; w = kzalloc(sizeof(*w), gfp_flags); if (w) { /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ atomic_inc(&masq_worker_count); INIT_WORK(&w->work, iterate_cleanup_work); w->ifindex = ifindex; w->net = net; netns_tracker_alloc(net, &w->ns_tracker, gfp_flags); w->iter = iter; if (addr) w->addr = *addr; schedule_work(&w->work); return; } module_put(THIS_MODULE); err_module: put_net(net); } static int device_cmp(struct nf_conn *i, void *arg) { const struct nf_conn_nat *nat = nfct_nat(i); const struct masq_dev_work *w = arg; if (!nat) return 0; return nat->masq_index == w->ifindex; } static int masq_device_event(struct notifier_block *this, unsigned long event, void *ptr) { const struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_DOWN) { /* Device was downed. Search entire table for * conntracks which were associated with that device, * and forget them. */ nf_nat_masq_schedule(net, NULL, dev->ifindex, device_cmp, GFP_KERNEL); } return NOTIFY_DONE; } static int inet_cmp(struct nf_conn *ct, void *ptr) { struct nf_conntrack_tuple *tuple; struct masq_dev_work *w = ptr; if (!device_cmp(ct, ptr)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { const struct in_ifaddr *ifa = ptr; const struct in_device *idev; const struct net_device *dev; union nf_inet_addr addr; if (event != NETDEV_DOWN) return NOTIFY_DONE; /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. 
*/ idev = ifa->ifa_dev; if (idev->dead) return NOTIFY_DONE; memset(&addr, 0, sizeof(addr)); addr.ip = ifa->ifa_address; dev = idev->dev; nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, inet_cmp, GFP_KERNEL); return NOTIFY_DONE; } static struct notifier_block masq_dev_notifier = { .notifier_call = masq_device_event, }; static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; #if IS_ENABLED(CONFIG_IPV6) static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr) { #ifdef CONFIG_IPV6_MODULE const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (!v6_ops) return -EHOSTUNREACH; return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr); #else return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr); #endif } unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out) { enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; struct in6_addr src; struct nf_conn *ct; struct nf_nat_range2 newrange; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); if (nat_ipv6_dev_get_saddr(nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &src) < 0) return NF_DROP; nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.in6 = src; newrange.max_addr.in6 = src; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). * * Defer it to the system workqueue. * * As we can have 'a lot' of inet_events (depending on amount of ipv6 * addresses being deleted), we also need to limit work item queue. 
*/ static int masq_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; union nf_inet_addr addr; if (event != NETDEV_DOWN) return NOTIFY_DONE; dev = ifa->idev->dev; memset(&addr, 0, sizeof(addr)); addr.in6 = ifa->addr; nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, GFP_ATOMIC); return NOTIFY_DONE; } static struct notifier_block masq_inet6_notifier = { .notifier_call = masq_inet6_event, }; static int nf_nat_masquerade_ipv6_register_notifier(void) { return register_inet6addr_notifier(&masq_inet6_notifier); } #else static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; } #endif int nf_nat_masquerade_inet_register_notifiers(void) { int ret = 0; mutex_lock(&masq_mutex); if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) { ret = -EOVERFLOW; goto out_unlock; } /* check if the notifier was already set */ if (++masq_refcnt > 1) goto out_unlock; /* Register for device down reports */ ret = register_netdevice_notifier(&masq_dev_notifier); if (ret) goto err_dec; /* Register IP address change reports */ ret = register_inetaddr_notifier(&masq_inet_notifier); if (ret) goto err_unregister; ret = nf_nat_masquerade_ipv6_register_notifier(); if (ret) goto err_unreg_inet; mutex_unlock(&masq_mutex); return ret; err_unreg_inet: unregister_inetaddr_notifier(&masq_inet_notifier); err_unregister: unregister_netdevice_notifier(&masq_dev_notifier); err_dec: masq_refcnt--; out_unlock: mutex_unlock(&masq_mutex); return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers); void nf_nat_masquerade_inet_unregister_notifiers(void) { mutex_lock(&masq_mutex); /* check if the notifiers still have clients */ if (--masq_refcnt > 0) goto out_unlock; unregister_netdevice_notifier(&masq_dev_notifier); unregister_inetaddr_notifier(&masq_inet_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&masq_inet6_notifier); #endif out_unlock: mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers);
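A minimal sketch (not from this file) of how a masquerade target module typically consumes the helpers above: take a reference on the shared notifiers at module init, call nf_nat_masquerade_ipv4() from its POST_ROUTING evaluation path, and drop the reference on exit. The demo_* names are hypothetical and error handling is trimmed.

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_masquerade.h>

/* Would be wired into the target's eval hook; shown here for illustration. */
static unsigned int __maybe_unused
demo_masq_target(struct sk_buff *skb, unsigned int hooknum,
		 const struct nf_nat_range2 *range,
		 const struct net_device *out)
{
	/* Valid only in NF_INET_POST_ROUTING, as the WARN_ON above enforces. */
	return nf_nat_masquerade_ipv4(skb, hooknum, range, out);
}

static int __init demo_masq_init(void)
{
	/* Refcounted, so multiple masquerade users can share the notifiers. */
	return nf_nat_masquerade_inet_register_notifiers();
}

static void __exit demo_masq_exit(void)
{
	nf_nat_masquerade_inet_unregister_notifiers();
}

module_init(demo_masq_init);
module_exit(demo_masq_exit);
MODULE_DESCRIPTION("Sketch of a masquerade notifier user");
MODULE_LICENSE("GPL");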
2 2 2 2 8 7 1 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012-2016 Pablo Neira Ayuso <pablo@netfilter.org> */ #include <linux/init.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_object *obj = nft_objref_priv(expr); obj->ops->eval(obj, regs, pkt); } static int nft_objref_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_object *obj = nft_objref_priv(expr); u8 genmask = nft_genmask_next(ctx->net); u32 objtype; if (!tb[NFTA_OBJREF_IMM_NAME] || !tb[NFTA_OBJREF_IMM_TYPE]) return -EINVAL; objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); obj = nft_obj_lookup(ctx->net, ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, genmask); if (IS_ERR(obj)) return -ENOENT; if (!nft_use_inc(&obj->use)) return -EMFILE; nft_objref_priv(expr) = obj; return 0; } static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_object *obj = nft_objref_priv(expr); if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->key.name) || nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->ops->type->type))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_objref_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_object *obj = nft_objref_priv(expr); if (phase == NFT_TRANS_COMMIT) return; nft_use_dec(&obj->use); } static void nft_objref_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_object *obj = nft_objref_priv(expr); nft_use_inc_restore(&obj->use); } static const struct nft_expr_ops nft_objref_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)), .eval = nft_objref_eval, .init = nft_objref_init, .activate = nft_objref_activate, .deactivate = nft_objref_deactivate, .dump = nft_objref_dump, .reduce = NFT_REDUCE_READONLY, }; struct nft_objref_map { struct nft_set *set; u8 sreg; struct nft_set_binding binding; }; void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_objref_map *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; struct net *net = nft_net(pkt); const struct nft_set_ext *ext; struct nft_object *obj; bool found; found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext); if 
(!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { regs->verdict.code = NFT_BREAK; return; } } obj = *nft_set_ext_obj(ext); obj->ops->eval(obj, regs, pkt); } static int nft_objref_map_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_objref_map *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; int err; set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_OBJREF_SET_NAME], tb[NFTA_OBJREF_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!(set->flags & NFT_SET_OBJECT)) return -EINVAL; err = nft_parse_register_load(ctx, tb[NFTA_OBJREF_SET_SREG], &priv->sreg, set->klen); if (err < 0) return err; priv->binding.flags = set->flags & NFT_SET_OBJECT; err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) return err; priv->set = set; return 0; } static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_objref_map *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_OBJREF_SET_SREG, priv->sreg) || nla_put_string(skb, NFTA_OBJREF_SET_NAME, priv->set->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_objref_map_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_objref_map *priv = nft_expr_priv(expr); nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase); } static void nft_objref_map_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_objref_map *priv = nft_expr_priv(expr); nf_tables_activate_set(ctx, priv->set); } static void nft_objref_map_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_objref_map *priv = nft_expr_priv(expr); nf_tables_destroy_set(ctx, priv->set); } static const struct nft_expr_ops nft_objref_map_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), .eval = nft_objref_map_eval, .init = nft_objref_map_init, .activate = nft_objref_map_activate, .deactivate = nft_objref_map_deactivate, .destroy = nft_objref_map_destroy, .dump = nft_objref_map_dump, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * nft_objref_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_OBJREF_SET_SREG] && (tb[NFTA_OBJREF_SET_NAME] || tb[NFTA_OBJREF_SET_ID])) return &nft_objref_map_ops; else if (tb[NFTA_OBJREF_IMM_NAME] && tb[NFTA_OBJREF_IMM_TYPE]) return &nft_objref_ops; return ERR_PTR(-EOPNOTSUPP); } static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 }, [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 }, [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, }; struct nft_expr_type nft_objref_type __read_mostly = { .name = "objref", .select_ops = nft_objref_select_ops, .policy = nft_objref_policy, .maxattr = NFTA_OBJREF_MAX, .owner = THIS_MODULE, };
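A standalone illustration (plain C, not kernel code) of the nft_objref_priv() pattern at the top of this file: because the expression's private area is sized to exactly one pointer (NFT_EXPR_SIZE(sizeof(struct nft_object *))), the object reference is stored and fetched by casting that area to a pointer-to-pointer and dereferencing it once. All demo_* names are made up for the example.

#include <stdio.h>
#include <string.h>

struct demo_object {
	const char *name;
};

struct demo_expr {
	/* stands in for nft_expr::data, one pointer wide and aligned */
	_Alignas(struct demo_object *) unsigned char data[sizeof(struct demo_object *)];
};

#define demo_expr_priv(expr)   ((void *)(expr)->data)
#define demo_objref_priv(expr) (*(struct demo_object **)demo_expr_priv(expr))

int main(void)
{
	struct demo_object counter = { .name = "per-rule counter" };
	struct demo_expr expr;

	memset(&expr, 0, sizeof(expr));
	demo_objref_priv(&expr) = &counter;	/* what the init path stores */
	printf("eval -> %s\n", demo_objref_priv(&expr)->name);	/* what eval follows */
	return 0;
}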
409 408 409 116 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 // SPDX-License-Identifier: GPL-2.0 /* * Implement mseal() syscall. * * Copyright (c) 2023,2024 Google, Inc. * * Author: Jeff Xu <jeffxu@chromium.org> */ #include <linux/mempolicy.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_context.h> #include <linux/syscalls.h> #include <linux/sched.h> #include "internal.h" static inline void set_vma_sealed(struct vm_area_struct *vma) { vm_flags_set(vma, VM_SEALED); } static bool is_madv_discard(int behavior) { switch (behavior) { case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_REMOVE: case MADV_DONTFORK: case MADV_WIPEONFORK: case MADV_GUARD_INSTALL: return true; } return false; } static bool is_ro_anon(struct vm_area_struct *vma) { /* check anonymous mapping. */ if (vma->vm_file || vma->vm_flags & VM_SHARED) return false; /* * check for non-writable: * PROT=RO or PKRU is not writeable. */ if (!(vma->vm_flags & VM_WRITE) || !arch_vma_access_permitted(vma, true, false, false)) return true; return false; } /* * Check if a vma is allowed to be modified by madvise. */ bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) { if (!is_madv_discard(behavior)) return true; if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) return false; /* Allow by default. */ return true; } static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { int ret = 0; vm_flags_t oldflags = vma->vm_flags; if (newflags == oldflags) goto out; vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; } set_vma_sealed(vma); out: *prev = vma; return ret; } /* * Check for do_mseal: * 1> start is part of a valid vma. * 2> end is part of a valid vma. * 3> No gap (unallocated address) between start and end. * 4> map is sealable. */ static int check_mm_seal(unsigned long start, unsigned long end) { struct vm_area_struct *vma; unsigned long nstart = start; VMA_ITERATOR(vmi, current->mm, start); /* going through each vma to check. */ for_each_vma_range(vmi, vma, end) { if (vma->vm_start > nstart) /* unallocated memory found. */ return -ENOMEM; if (vma->vm_end >= end) return 0; nstart = vma->vm_end; } return -ENOMEM; } /* * Apply sealing. 
*/ static int apply_mm_seal(unsigned long start, unsigned long end) { unsigned long nstart; struct vm_area_struct *vma, *prev; VMA_ITERATOR(vmi, current->mm, start); vma = vma_iter_load(&vmi); /* * Note: check_mm_seal should already checked ENOMEM case. * so vma should not be null, same for the other ENOMEM cases. */ prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; nstart = start; for_each_vma_range(vmi, vma, end) { int error; unsigned long tmp; vm_flags_t newflags; newflags = vma->vm_flags | VM_SEALED; tmp = vma->vm_end; if (tmp > end) tmp = end; error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) return error; nstart = vma_iter_end(&vmi); } return 0; } /* * mseal(2) seals the VM's meta data from * selected syscalls. * * addr/len: VM address range. * * The address range by addr/len must meet: * start (addr) must be in a valid VMA. * end (addr + len) must be in a valid VMA. * no gap (unallocated memory) between start and end. * start (addr) must be page aligned. * * len: len will be page aligned implicitly. * * Below VMA operations are blocked after sealing. * 1> Unmapping, moving to another location, and shrinking * the size, via munmap() and mremap(), can leave an empty * space, therefore can be replaced with a VMA with a new * set of attributes. * 2> Moving or expanding a different vma into the current location, * via mremap(). * 3> Modifying a VMA via mmap(MAP_FIXED). * 4> Size expansion, via mremap(), does not appear to pose any * specific risks to sealed VMAs. It is included anyway because * the use case is unclear. In any case, users can rely on * merging to expand a sealed VMA. * 5> mprotect and pkey_mprotect. * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED) * for anonymous memory, when users don't have write permission to the * memory. Those behaviors can alter region contents by discarding pages, * effectively a memset(0) for anonymous memory. * * flags: reserved. * * return values: * zero: success. * -EINVAL: * invalid input flags. * start address is not page aligned. * Address arange (start + len) overflow. * -ENOMEM: * addr is not a valid address (not allocated). * end (start + len) is not a valid address. * a gap (unallocated memory) between start and end. * -EPERM: * - In 32 bit architecture, sealing is not supported. * Note: * user can call mseal(2) multiple times, adding a seal on an * already sealed memory is a no-action (no error). * * unseal() is not supported. */ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) { size_t len; int ret = 0; unsigned long end; struct mm_struct *mm = current->mm; ret = can_do_mseal(flags); if (ret) return ret; start = untagged_addr(start); if (!PAGE_ALIGNED(start)) return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero. */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; if (mmap_write_lock_killable(mm)) return -EINTR; /* * First pass, this helps to avoid * partial sealing in case of error in input address range, * e.g. ENOMEM error. */ ret = check_mm_seal(start, end); if (ret) goto out; /* * Second pass, this should success, unless there are errors * from vma_modify_flags, e.g. merge/split error, or process * reaching the max supported VMAs, however, those cases shall * be rare. 
 */
	ret = apply_mm_seal(start, end);
out:
	mmap_write_unlock(current->mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len,
		unsigned long, flags)
{
	return do_mseal(start, len, flags);
}
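The sealing semantics documented in the comment block above can be observed from userspace: map an anonymous page, seal it, then watch mprotect() be refused. A small sketch follows, assuming a kernel with mseal(2) support; __NR_mseal is 462 on x86-64, and the fallback define is only for older uapi headers.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_mseal
#define __NR_mseal 462		/* x86-64; adjust for other architectures */
#endif

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* addr is page aligned and len is rounded up, as do_mseal() requires. */
	if (syscall(__NR_mseal, p, (size_t)pagesz, 0UL) != 0) {
		perror("mseal");
		return 1;
	}
	/* Sealed: changing protections is one of the blocked operations. */
	if (mprotect(p, pagesz, PROT_READ) != 0)
		printf("mprotect refused as expected: %s\n", strerror(errno));
	return 0;
}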
9 17 43 41 3 38 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 // SPDX-License-Identifier: GPL-2.0-or-later /* Key garbage collector * * Copyright (C) 2009-2011 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/security.h> #include <keys/keyring-type.h> #include "internal.h" /* * Delay between key revocation/expiry in seconds */ unsigned key_gc_delay = 5 * 60; /* * Reaper for unused keys. */ static void key_garbage_collector(struct work_struct *work); DECLARE_WORK(key_gc_work, key_garbage_collector); /* * Reaper for links from keyrings to dead keys. */ static void key_gc_timer_func(struct timer_list *); static DEFINE_TIMER(key_gc_timer, key_gc_timer_func); static time64_t key_gc_next_run = TIME64_MAX; static struct key_type *key_gc_dead_keytype; static unsigned long key_gc_flags; #define KEY_GC_KEY_EXPIRED 0 /* A key expired and needs unlinking */ #define KEY_GC_REAP_KEYTYPE 1 /* A keytype is being unregistered */ #define KEY_GC_REAPING_KEYTYPE 2 /* Cleared when keytype reaped */ /* * Any key whose type gets unregistered will be re-typed to this if it can't be * immediately unlinked. */ struct key_type key_type_dead = { .name = ".dead", }; /* * Schedule a garbage collection run. * - time precision isn't particularly important */ void key_schedule_gc(time64_t gc_at) { unsigned long expires; time64_t now = ktime_get_real_seconds(); kenter("%lld", gc_at - now); if (gc_at <= now || test_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) { kdebug("IMMEDIATE"); schedule_work(&key_gc_work); } else if (gc_at < key_gc_next_run) { kdebug("DEFERRED"); key_gc_next_run = gc_at; expires = jiffies + (gc_at - now) * HZ; mod_timer(&key_gc_timer, expires); } } /* * Set the expiration time on a key. */ void key_set_expiry(struct key *key, time64_t expiry) { key->expiry = expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; key_schedule_gc(expiry); } } /* * Schedule a dead links collection run. 
*/ void key_schedule_gc_links(void) { set_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags); schedule_work(&key_gc_work); } /* * Some key's cleanup time was met after it expired, so we need to get the * reaper to go through a cycle finding expired keys. */ static void key_gc_timer_func(struct timer_list *unused) { kenter(""); key_gc_next_run = TIME64_MAX; key_schedule_gc_links(); } /* * Reap keys of dead type. * * We use three flags to make sure we see three complete cycles of the garbage * collector: the first to mark keys of that type as being dead, the second to * collect dead links and the third to clean up the dead keys. We have to be * careful as there may already be a cycle in progress. * * The caller must be holding key_types_sem. */ void key_gc_keytype(struct key_type *ktype) { kenter("%s", ktype->name); key_gc_dead_keytype = ktype; set_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); smp_mb(); set_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags); kdebug("schedule"); schedule_work(&key_gc_work); kdebug("sleep"); wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE, TASK_UNINTERRUPTIBLE); key_gc_dead_keytype = NULL; kleave(""); } /* * Garbage collect a list of unreferenced, detached keys */ static noinline void key_gc_unused_keys(struct list_head *keys) { while (!list_empty(keys)) { struct key *key = list_entry(keys->next, struct key, graveyard_link); short state = key->state; list_del(&key->graveyard_link); kdebug("- %u", key->serial); key_check(key); #ifdef CONFIG_KEY_NOTIFICATIONS remove_watch_list(key->watchers, key->serial); key->watchers = NULL; #endif /* Throw away the key data if the key is instantiated */ if (state == KEY_IS_POSITIVE && key->type->destroy) key->type->destroy(key); security_key_free(key); atomic_dec(&key->user->nkeys); if (state != KEY_IS_UNINSTANTIATED) atomic_dec(&key->user->nikeys); key_user_put(key->user); key_put_tag(key->domain_tag); kfree(key->description); memzero_explicit(key, sizeof(*key)); kmem_cache_free(key_jar, key); } } /* * Garbage collector for unused keys. * * This is done in process context so that we don't have to disable interrupts * all over the place. key_put() schedules this rather than trying to do the * cleanup itself, which means key_put() doesn't have to sleep. */ static void key_garbage_collector(struct work_struct *work) { static LIST_HEAD(graveyard); static u8 gc_state; /* Internal persistent state */ #define KEY_GC_REAP_AGAIN 0x01 /* - Need another cycle */ #define KEY_GC_REAPING_LINKS 0x02 /* - We need to reap links */ #define KEY_GC_REAPING_DEAD_1 0x10 /* - We need to mark dead keys */ #define KEY_GC_REAPING_DEAD_2 0x20 /* - We need to reap dead key links */ #define KEY_GC_REAPING_DEAD_3 0x40 /* - We need to reap dead keys */ #define KEY_GC_FOUND_DEAD_KEY 0x80 /* - We found at least one dead key */ struct rb_node *cursor; struct key *key; time64_t new_timer, limit, expiry; kenter("[%lx,%x]", key_gc_flags, gc_state); limit = ktime_get_real_seconds(); /* Work out what we're going to be doing in this pass */ gc_state &= KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2; gc_state <<= 1; if (test_and_clear_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags)) gc_state |= KEY_GC_REAPING_LINKS; if (test_and_clear_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) gc_state |= KEY_GC_REAPING_DEAD_1; kdebug("new pass %x", gc_state); new_timer = TIME64_MAX; /* As only this function is permitted to remove things from the key * serial tree, if cursor is non-NULL then it will always point to a * valid node in the tree - even if lock got dropped. 
*/ spin_lock(&key_serial_lock); cursor = rb_first(&key_serial_tree); continue_scanning: while (cursor) { key = rb_entry(cursor, struct key, serial_node); cursor = rb_next(cursor); if (refcount_read(&key->usage) == 0) goto found_unreferenced_key; if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) { if (key->type == key_gc_dead_keytype) { gc_state |= KEY_GC_FOUND_DEAD_KEY; set_bit(KEY_FLAG_DEAD, &key->flags); key->perm = 0; goto skip_dead_key; } else if (key->type == &key_type_keyring && key->restrict_link) { goto found_restricted_keyring; } } expiry = key->expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; if (expiry > limit && expiry < new_timer) { kdebug("will expire %x in %lld", key_serial(key), key->expiry - limit); new_timer = key->expiry; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) if (key->type == key_gc_dead_keytype) gc_state |= KEY_GC_FOUND_DEAD_KEY; if ((gc_state & KEY_GC_REAPING_LINKS) || unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) { if (key->type == &key_type_keyring) goto found_keyring; } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) if (key->type == key_gc_dead_keytype) goto destroy_dead_key; skip_dead_key: if (spin_is_contended(&key_serial_lock) || need_resched()) goto contended; } contended: spin_unlock(&key_serial_lock); maybe_resched: if (cursor) { cond_resched(); spin_lock(&key_serial_lock); goto continue_scanning; } /* We've completed the pass. Set the timer if we need to and queue a * new cycle if necessary. We keep executing cycles until we find one * where we didn't reap any keys. */ kdebug("pass complete"); if (new_timer != TIME64_MAX) { new_timer += key_gc_delay; key_schedule_gc(new_timer); } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2) || !list_empty(&graveyard)) { /* Make sure that all pending keyring payload destructions are * fulfilled and that people aren't now looking at dead or * dying keys that they don't have a reference upon or a link * to. */ kdebug("gc sync"); synchronize_rcu(); } if (!list_empty(&graveyard)) { kdebug("gc keys"); key_gc_unused_keys(&graveyard); } if (unlikely(gc_state & (KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2))) { if (!(gc_state & KEY_GC_FOUND_DEAD_KEY)) { /* No remaining dead keys: short circuit the remaining * keytype reap cycles. */ kdebug("dead short"); gc_state &= ~(KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2); gc_state |= KEY_GC_REAPING_DEAD_3; } else { gc_state |= KEY_GC_REAP_AGAIN; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) { kdebug("dead wake"); smp_mb(); clear_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); wake_up_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE); } if (gc_state & KEY_GC_REAP_AGAIN) schedule_work(&key_gc_work); kleave(" [end %x]", gc_state); return; /* We found an unreferenced key - once we've removed it from the tree, * we can safely drop the lock. */ found_unreferenced_key: kdebug("unrefd key %d", key->serial); rb_erase(&key->serial_node, &key_serial_tree); spin_unlock(&key_serial_lock); list_add_tail(&key->graveyard_link, &graveyard); gc_state |= KEY_GC_REAP_AGAIN; goto maybe_resched; /* We found a restricted keyring and need to update the restriction if * it is associated with the dead key type. */ found_restricted_keyring: spin_unlock(&key_serial_lock); keyring_restriction_gc(key, key_gc_dead_keytype); goto maybe_resched; /* We found a keyring and we need to check the payload for links to * dead or expired keys. 
 * We don't flag another reap immediately as we
	 * have to wait for the old payload to be destroyed by RCU before we
	 * can reap the keys to which it refers.
	 */
found_keyring:
	spin_unlock(&key_serial_lock);
	keyring_gc(key, limit);
	goto maybe_resched;

	/* We found a dead key that is still referenced.  Reset its type and
	 * destroy its payload with its semaphore held.
	 */
destroy_dead_key:
	spin_unlock(&key_serial_lock);
	kdebug("destroy key %d", key->serial);
	down_write(&key->sem);
	key->type = &key_type_dead;
	if (key_gc_dead_keytype->destroy)
		key_gc_dead_keytype->destroy(key);
	memset(&key->payload, KEY_DESTROY, sizeof(key->payload));
	up_write(&key->sem);
	goto maybe_resched;
}
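The expiry path above can be exercised from userspace by giving a key a short timeout: key_set_expiry() then schedules the collector, which unlinks the key roughly key_gc_delay (five minutes) after it expires. A sketch follows, assuming libkeyutils is available (link with -lkeyutils).

#include <keyutils.h>
#include <stdio.h>

int main(void)
{
	key_serial_t key = add_key("user", "gc-demo", "payload", 7,
				   KEY_SPEC_SESSION_KEYRING);

	if (key < 0) {
		perror("add_key");
		return 1;
	}
	/* Expires in 10 seconds; unlinking happens on a later GC cycle. */
	if (keyctl_set_timeout(key, 10) < 0) {
		perror("keyctl_set_timeout");
		return 1;
	}
	printf("key %d will expire and later be garbage collected\n", key);
	return 0;
}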
4 11 13 6 4 3 1 1 1 1 1 7 7 1 1 2 13 13 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Bluetooth virtual HCI driver * * Copyright (C) 2000-2001 Qualcomm Incorporated * Copyright (C) 2002-2003 Maxim Krasnyansky <maxk@qualcomm.com> * Copyright (C) 2004-2006 Marcel Holtmann <marcel@holtmann.org> */ #include <linux/module.h> #include <linux/unaligned.h> #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/poll.h> #include <linux/skbuff.h> #include <linux/miscdevice.h> #include <linux/debugfs.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #define VERSION "1.5" static bool amp; struct vhci_data { struct hci_dev *hdev; wait_queue_head_t read_wait; struct sk_buff_head readq; struct mutex 
open_mutex; struct delayed_work open_timeout; struct work_struct suspend_work; bool suspended; bool wakeup; __u16 msft_opcode; bool aosp_capable; atomic_t initialized; }; static int vhci_open_dev(struct hci_dev *hdev) { return 0; } static int vhci_close_dev(struct hci_dev *hdev) { struct vhci_data *data = hci_get_drvdata(hdev); skb_queue_purge(&data->readq); return 0; } static int vhci_flush(struct hci_dev *hdev) { struct vhci_data *data = hci_get_drvdata(hdev); skb_queue_purge(&data->readq); return 0; } static int vhci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { struct vhci_data *data = hci_get_drvdata(hdev); memcpy(skb_push(skb, 1), &hci_skb_pkt_type(skb), 1); skb_queue_tail(&data->readq, skb); if (atomic_read(&data->initialized)) wake_up_interruptible(&data->read_wait); return 0; } static int vhci_get_data_path_id(struct hci_dev *hdev, u8 *data_path_id) { *data_path_id = 0; return 0; } static int vhci_get_codec_config_data(struct hci_dev *hdev, __u8 type, struct bt_codec *codec, __u8 *vnd_len, __u8 **vnd_data) { if (type != ESCO_LINK) return -EINVAL; *vnd_len = 0; *vnd_data = NULL; return 0; } static bool vhci_wakeup(struct hci_dev *hdev) { struct vhci_data *data = hci_get_drvdata(hdev); return data->wakeup; } static ssize_t force_suspend_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *data = file->private_data; char buf[3]; buf[0] = data->suspended ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } static void vhci_suspend_work(struct work_struct *work) { struct vhci_data *data = container_of(work, struct vhci_data, suspend_work); if (data->suspended) hci_suspend_dev(data->hdev); else hci_resume_dev(data->hdev); } static ssize_t force_suspend_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *data = file->private_data; bool enable; int err; err = kstrtobool_from_user(user_buf, count, &enable); if (err) return err; if (data->suspended == enable) return -EALREADY; data->suspended = enable; schedule_work(&data->suspend_work); return count; } static const struct file_operations force_suspend_fops = { .open = simple_open, .read = force_suspend_read, .write = force_suspend_write, .llseek = default_llseek, }; static ssize_t force_wakeup_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *data = file->private_data; char buf[3]; buf[0] = data->wakeup ? 
'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } static ssize_t force_wakeup_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *data = file->private_data; bool enable; int err; err = kstrtobool_from_user(user_buf, count, &enable); if (err) return err; if (data->wakeup == enable) return -EALREADY; data->wakeup = enable; return count; } static const struct file_operations force_wakeup_fops = { .open = simple_open, .read = force_wakeup_read, .write = force_wakeup_write, .llseek = default_llseek, }; static int msft_opcode_set(void *data, u64 val) { struct vhci_data *vhci = data; if (val > 0xffff || hci_opcode_ogf(val) != 0x3f) return -EINVAL; if (vhci->msft_opcode) return -EALREADY; vhci->msft_opcode = val; return 0; } static int msft_opcode_get(void *data, u64 *val) { struct vhci_data *vhci = data; *val = vhci->msft_opcode; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(msft_opcode_fops, msft_opcode_get, msft_opcode_set, "%llu\n"); static ssize_t aosp_capable_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *vhci = file->private_data; char buf[3]; buf[0] = vhci->aosp_capable ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } static ssize_t aosp_capable_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *vhci = file->private_data; bool enable; int err; err = kstrtobool_from_user(user_buf, count, &enable); if (err) return err; if (!enable) return -EINVAL; if (vhci->aosp_capable) return -EALREADY; vhci->aosp_capable = enable; return count; } static const struct file_operations aosp_capable_fops = { .open = simple_open, .read = aosp_capable_read, .write = aosp_capable_write, .llseek = default_llseek, }; static int vhci_setup(struct hci_dev *hdev) { struct vhci_data *vhci = hci_get_drvdata(hdev); if (vhci->msft_opcode) hci_set_msft_opcode(hdev, vhci->msft_opcode); if (vhci->aosp_capable) hci_set_aosp_capable(hdev); return 0; } static void vhci_coredump(struct hci_dev *hdev) { /* No need to do anything */ } static void vhci_coredump_hdr(struct hci_dev *hdev, struct sk_buff *skb) { char buf[80]; snprintf(buf, sizeof(buf), "Controller Name: vhci_ctrl\n"); skb_put_data(skb, buf, strlen(buf)); snprintf(buf, sizeof(buf), "Firmware Version: vhci_fw\n"); skb_put_data(skb, buf, strlen(buf)); snprintf(buf, sizeof(buf), "Driver: vhci_drv\n"); skb_put_data(skb, buf, strlen(buf)); snprintf(buf, sizeof(buf), "Vendor: vhci\n"); skb_put_data(skb, buf, strlen(buf)); } #define MAX_COREDUMP_LINE_LEN 40 struct devcoredump_test_data { enum devcoredump_state state; unsigned int timeout; char data[MAX_COREDUMP_LINE_LEN]; }; static inline void force_devcd_timeout(struct hci_dev *hdev, unsigned int timeout) { #ifdef CONFIG_DEV_COREDUMP hdev->dump.timeout = msecs_to_jiffies(timeout * 1000); #endif } static ssize_t force_devcd_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct vhci_data *data = file->private_data; struct hci_dev *hdev = data->hdev; struct sk_buff *skb = NULL; struct devcoredump_test_data dump_data; size_t data_size; int ret; if (count < offsetof(struct devcoredump_test_data, data) || count > sizeof(dump_data)) return -EINVAL; if (copy_from_user(&dump_data, user_buf, count)) return -EFAULT; data_size = count - offsetof(struct devcoredump_test_data, data); skb = alloc_skb(data_size, GFP_ATOMIC); if (!skb) return -ENOMEM; 
skb_put_data(skb, &dump_data.data, data_size); hci_devcd_register(hdev, vhci_coredump, vhci_coredump_hdr, NULL); /* Force the devcoredump timeout */ if (dump_data.timeout) force_devcd_timeout(hdev, dump_data.timeout); ret = hci_devcd_init(hdev, skb->len); if (ret) { BT_ERR("Failed to generate devcoredump"); kfree_skb(skb); return ret; } hci_devcd_append(hdev, skb); switch (dump_data.state) { case HCI_DEVCOREDUMP_DONE: hci_devcd_complete(hdev); break; case HCI_DEVCOREDUMP_ABORT: hci_devcd_abort(hdev); break; case HCI_DEVCOREDUMP_TIMEOUT: /* Do nothing */ break; default: return -EINVAL; } return count; } static const struct file_operations force_devcoredump_fops = { .open = simple_open, .write = force_devcd_write, }; static int __vhci_create_device(struct vhci_data *data, __u8 opcode) { struct hci_dev *hdev; struct sk_buff *skb; if (data->hdev) return -EBADFD; /* bits 2-5 are reserved (must be zero) */ if (opcode & 0x3c) return -EINVAL; skb = bt_skb_alloc(4, GFP_KERNEL); if (!skb) return -ENOMEM; hdev = hci_alloc_dev(); if (!hdev) { kfree_skb(skb); return -ENOMEM; } data->hdev = hdev; hdev->bus = HCI_VIRTUAL; hci_set_drvdata(hdev, data); hdev->open = vhci_open_dev; hdev->close = vhci_close_dev; hdev->flush = vhci_flush; hdev->send = vhci_send_frame; hdev->get_data_path_id = vhci_get_data_path_id; hdev->get_codec_config_data = vhci_get_codec_config_data; hdev->wakeup = vhci_wakeup; hdev->setup = vhci_setup; set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks); /* bit 6 is for external configuration */ if (opcode & 0x40) set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks); /* bit 7 is for raw device */ if (opcode & 0x80) set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); if (hci_register_dev(hdev) < 0) { BT_ERR("Can't register HCI device"); hci_free_dev(hdev); data->hdev = NULL; kfree_skb(skb); return -EBUSY; } debugfs_create_file("force_suspend", 0644, hdev->debugfs, data, &force_suspend_fops); debugfs_create_file("force_wakeup", 0644, hdev->debugfs, data, &force_wakeup_fops); if (IS_ENABLED(CONFIG_BT_MSFTEXT)) debugfs_create_file("msft_opcode", 0644, hdev->debugfs, data, &msft_opcode_fops); if (IS_ENABLED(CONFIG_BT_AOSPEXT)) debugfs_create_file("aosp_capable", 0644, hdev->debugfs, data, &aosp_capable_fops); debugfs_create_file("force_devcoredump", 0644, hdev->debugfs, data, &force_devcoredump_fops); hci_skb_pkt_type(skb) = HCI_VENDOR_PKT; skb_put_u8(skb, 0xff); skb_put_u8(skb, opcode); put_unaligned_le16(hdev->id, skb_put(skb, 2)); skb_queue_head(&data->readq, skb); atomic_inc(&data->initialized); wake_up_interruptible(&data->read_wait); return 0; } static int vhci_create_device(struct vhci_data *data, __u8 opcode) { int err; mutex_lock(&data->open_mutex); err = __vhci_create_device(data, opcode); mutex_unlock(&data->open_mutex); return err; } static inline ssize_t vhci_get_user(struct vhci_data *data, struct iov_iter *from) { size_t len = iov_iter_count(from); struct sk_buff *skb; __u8 pkt_type, opcode; int ret; if (len < 2 || len > HCI_MAX_FRAME_SIZE) return -EINVAL; skb = bt_skb_alloc(len, GFP_KERNEL); if (!skb) return -ENOMEM; if (!copy_from_iter_full(skb_put(skb, len), len, from)) { kfree_skb(skb); return -EFAULT; } pkt_type = *((__u8 *) skb->data); skb_pull(skb, 1); switch (pkt_type) { case HCI_EVENT_PKT: case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: case HCI_ISODATA_PKT: if (!data->hdev) { kfree_skb(skb); return -ENODEV; } hci_skb_pkt_type(skb) = pkt_type; ret = hci_recv_frame(data->hdev, skb); break; case HCI_VENDOR_PKT: cancel_delayed_work_sync(&data->open_timeout); opcode = *((__u8 *) 
skb->data); skb_pull(skb, 1); if (skb->len > 0) { kfree_skb(skb); return -EINVAL; } kfree_skb(skb); ret = vhci_create_device(data, opcode); break; default: kfree_skb(skb); return -EINVAL; } return (ret < 0) ? ret : len; } static inline ssize_t vhci_put_user(struct vhci_data *data, struct sk_buff *skb, char __user *buf, int count) { char __user *ptr = buf; int len; len = min_t(unsigned int, skb->len, count); if (copy_to_user(ptr, skb->data, len)) return -EFAULT; if (!data->hdev) return len; data->hdev->stat.byte_tx += len; switch (hci_skb_pkt_type(skb)) { case HCI_COMMAND_PKT: data->hdev->stat.cmd_tx++; break; case HCI_ACLDATA_PKT: data->hdev->stat.acl_tx++; break; case HCI_SCODATA_PKT: data->hdev->stat.sco_tx++; break; } return len; } static ssize_t vhci_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct vhci_data *data = file->private_data; struct sk_buff *skb; ssize_t ret = 0; while (count) { skb = skb_dequeue(&data->readq); if (skb) { ret = vhci_put_user(data, skb, buf, count); if (ret < 0) skb_queue_head(&data->readq, skb); else kfree_skb(skb); break; } if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } ret = wait_event_interruptible(data->read_wait, !skb_queue_empty(&data->readq)); if (ret < 0) break; } return ret; } static ssize_t vhci_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhci_data *data = file->private_data; return vhci_get_user(data, from); } static __poll_t vhci_poll(struct file *file, poll_table *wait) { struct vhci_data *data = file->private_data; poll_wait(file, &data->read_wait, wait); if (!skb_queue_empty(&data->readq)) return EPOLLIN | EPOLLRDNORM; return EPOLLOUT | EPOLLWRNORM; } static void vhci_open_timeout(struct work_struct *work) { struct vhci_data *data = container_of(work, struct vhci_data, open_timeout.work); vhci_create_device(data, 0x00); } static int vhci_open(struct inode *inode, struct file *file) { struct vhci_data *data; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; skb_queue_head_init(&data->readq); init_waitqueue_head(&data->read_wait); mutex_init(&data->open_mutex); INIT_DELAYED_WORK(&data->open_timeout, vhci_open_timeout); INIT_WORK(&data->suspend_work, vhci_suspend_work); file->private_data = data; nonseekable_open(inode, file); schedule_delayed_work(&data->open_timeout, msecs_to_jiffies(1000)); return 0; } static int vhci_release(struct inode *inode, struct file *file) { struct vhci_data *data = file->private_data; struct hci_dev *hdev; cancel_delayed_work_sync(&data->open_timeout); flush_work(&data->suspend_work); hdev = data->hdev; if (hdev) { hci_unregister_dev(hdev); hci_free_dev(hdev); } skb_queue_purge(&data->readq); file->private_data = NULL; kfree(data); return 0; } static const struct file_operations vhci_fops = { .owner = THIS_MODULE, .read = vhci_read, .write_iter = vhci_write, .poll = vhci_poll, .open = vhci_open, .release = vhci_release, }; static struct miscdevice vhci_miscdev = { .name = "vhci", .fops = &vhci_fops, .minor = VHCI_MINOR, }; module_misc_device(vhci_miscdev); module_param(amp, bool, 0644); MODULE_PARM_DESC(amp, "Create AMP controller device"); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); MODULE_DESCRIPTION("Bluetooth virtual HCI driver ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS("devname:vhci"); MODULE_ALIAS_MISCDEV(VHCI_MINOR);
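The misc device registered above is normally driven from userspace; the short sketch below (not part of the driver) shows one way to exercise it: write the two-byte HCI_VENDOR_PKT frame that __vhci_create_device() expects (0xff followed by opcode 0x00 for an ordinary controller) and read back the four-byte notification carrying the new hci index. It assumes the hci_vhci module is loaded and that the caller is allowed to open /dev/vhci.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        uint8_t create[2] = { 0xff, 0x00 };     /* HCI_VENDOR_PKT + opcode */
        uint8_t reply[4];                       /* 0xff, opcode, hci index (le16) */
        int fd = open("/dev/vhci", O_RDWR);

        if (fd < 0 || write(fd, create, sizeof(create)) != sizeof(create))
                return 1;

        /* the driver queues this notification at the head of readq */
        if (read(fd, reply, sizeof(reply)) == sizeof(reply))
                printf("created hci%u\n",
                       (unsigned int)(reply[2] | (reply[3] << 8)));

        /* keep the fd open; vhci_release() unregisters the controller */
        pause();
        return 0;
}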
/* * llc_core.c - Minimum needed routines for sap handling and module init/exit * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/init.h> #include <net/net_namespace.h> #include <net/llc.h> LIST_HEAD(llc_sap_list); static DEFINE_SPINLOCK(llc_sap_list_lock); /** * llc_sap_alloc - allocates and initializes sap. * * Allocates and initializes sap. */ static struct llc_sap *llc_sap_alloc(void) { struct llc_sap *sap = kzalloc(sizeof(*sap), GFP_ATOMIC); int i; if (sap) { /* sap->laddr.mac - leave as a null, it's filled by bind */ sap->state = LLC_SAP_STATE_ACTIVE; spin_lock_init(&sap->sk_lock); for (i = 0; i < LLC_SK_LADDR_HASH_ENTRIES; i++) INIT_HLIST_NULLS_HEAD(&sap->sk_laddr_hash[i], i); refcount_set(&sap->refcnt, 1); } return sap; } static struct llc_sap *__llc_sap_find(unsigned char sap_value) { struct llc_sap *sap; list_for_each_entry(sap, &llc_sap_list, node) if (sap->laddr.lsap == sap_value) goto out; sap = NULL; out: return sap; } /** * llc_sap_find - searches a SAP in station * @sap_value: sap to be found * * Searches for a sap in the sap list of the LLC's station upon the sap ID. * If the sap is found it will be refcounted and the user will have to do * a llc_sap_put after use. * Returns the sap or %NULL if not found. */ struct llc_sap *llc_sap_find(unsigned char sap_value) { struct llc_sap *sap; rcu_read_lock_bh(); sap = __llc_sap_find(sap_value); if (!sap || !llc_sap_hold_safe(sap)) sap = NULL; rcu_read_unlock_bh(); return sap; } /** * llc_sap_open - open interface to the upper layers. * @lsap: SAP number. * @func: rcv func for datalink protos * * Interface function to upper layer. Each one who wants to get a SAP * (for example NetBEUI) should call this function. Returns the opened * SAP for success, NULL for failure. */ struct llc_sap *llc_sap_open(unsigned char lsap, int (*func)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)) { struct llc_sap *sap = NULL; spin_lock_bh(&llc_sap_list_lock); if (__llc_sap_find(lsap)) /* SAP already exists */ goto out; sap = llc_sap_alloc(); if (!sap) goto out; sap->laddr.lsap = lsap; sap->rcv_func = func; list_add_tail_rcu(&sap->node, &llc_sap_list); out: spin_unlock_bh(&llc_sap_list_lock); return sap; } /** * llc_sap_close - close interface for upper layers. * @sap: SAP to be closed. * * Close interface function to upper layer.
Each one who wants to * close an open SAP (for example NetBEUI) should call this function. * Removes this sap from the list of saps in the station and then * frees the memory for this sap. */ void llc_sap_close(struct llc_sap *sap) { WARN_ON(sap->sk_count); spin_lock_bh(&llc_sap_list_lock); list_del_rcu(&sap->node); spin_unlock_bh(&llc_sap_list_lock); kfree_rcu(sap, rcu); } static struct packet_type llc_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_802_2), .func = llc_rcv, }; static int __init llc_init(void) { dev_add_pack(&llc_packet_type); return 0; } static void __exit llc_exit(void) { dev_remove_pack(&llc_packet_type); } module_init(llc_init); module_exit(llc_exit); EXPORT_SYMBOL(llc_sap_list); EXPORT_SYMBOL(llc_sap_find); EXPORT_SYMBOL(llc_sap_open); EXPORT_SYMBOL(llc_sap_close); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003"); MODULE_DESCRIPTION("LLC IEEE 802.2 core support");
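For context, here is a minimal sketch (not taken from the file above) of how an upper-layer protocol might use this interface: claim a SAP with llc_sap_open(), receive PDUs through the registered handler, and drop the reference on unload. The SAP value 0x42 and the demo_* names are invented for illustration; llc_sap_put() from <net/llc.h> is assumed to release the reference taken by llc_sap_open() and end up in llc_sap_close() once the count reaches zero.

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/llc.h>

static struct llc_sap *demo_sap;

/* matches the rcv_func signature expected by llc_sap_open() */
static int demo_rcv(struct sk_buff *skb, struct net_device *dev,
                    struct packet_type *pt, struct net_device *orig_dev)
{
        pr_info("llc demo: %u byte PDU on %s\n", skb->len, dev->name);
        kfree_skb(skb);
        return 0;
}

static int __init demo_init(void)
{
        demo_sap = llc_sap_open(0x42, demo_rcv);
        return demo_sap ? 0 : -EBUSY;   /* NULL: SAP taken or allocation failed */
}

static void __exit demo_exit(void)
{
        llc_sap_put(demo_sap);          /* drops the reference from llc_sap_open() */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");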
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions implement sctp diag support. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> #include <net/sctp/sctp.h> static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info); /* define some functions to make asoc/ep fill look clean */ static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r, struct sock *sk, struct sctp_association *asoc) { union sctp_addr laddr, paddr; struct dst_entry *dst; struct timer_list *t3_rtx = &asoc->peer.primary_path->T3_rtx_timer; laddr = list_entry(asoc->base.bind_addr.address_list.next, struct sctp_sockaddr_entry, list)->a; paddr = asoc->peer.primary_path->ipaddr; dst = asoc->peer.primary_path->dst; r->idiag_family = sk->sk_family; r->id.idiag_sport = htons(asoc->base.bind_addr.port); r->id.idiag_dport = htons(asoc->peer.port); r->id.idiag_if = dst ?
dst->dev->ifindex : 0; sock_diag_save_cookie(sk, r->id.idiag_cookie); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { *(struct in6_addr *)r->id.idiag_src = laddr.v6.sin6_addr; *(struct in6_addr *)r->id.idiag_dst = paddr.v6.sin6_addr; } else #endif { memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); r->id.idiag_src[0] = laddr.v4.sin_addr.s_addr; r->id.idiag_dst[0] = paddr.v4.sin_addr.s_addr; } r->idiag_state = asoc->state; if (timer_pending(t3_rtx)) { r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX; r->idiag_retrans = asoc->rtx_data_chunks; r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies); } } static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, struct list_head *address_list) { struct sctp_sockaddr_entry *laddr; int addrlen = sizeof(struct sockaddr_storage); int addrcnt = 0; struct nlattr *attr; void *info = NULL; list_for_each_entry_rcu(laddr, address_list, list) addrcnt++; attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt); if (!attr) return -EMSGSIZE; info = nla_data(attr); list_for_each_entry_rcu(laddr, address_list, list) { memcpy(info, &laddr->a, sizeof(laddr->a)); memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); info += addrlen; } return 0; } static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, struct sctp_association *asoc) { int addrlen = sizeof(struct sockaddr_storage); struct sctp_transport *from; struct nlattr *attr; void *info = NULL; attr = nla_reserve(skb, INET_DIAG_PEERS, addrlen * asoc->peer.transport_count); if (!attr) return -EMSGSIZE; info = nla_data(attr); list_for_each_entry(from, &asoc->peer.transport_addr_list, transports) { memcpy(info, &from->ipaddr, sizeof(from->ipaddr)); memset(info + sizeof(from->ipaddr), 0, addrlen - sizeof(from->ipaddr)); info += addrlen; } return 0; } /* sctp asoc/ep fill*/ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, int portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh, bool net_admin) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct list_head *addr_list; struct inet_diag_msg *r; struct nlmsghdr *nlh; int ext = req->idiag_ext; struct sctp_infox infox; void *info = NULL; nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); BUG_ON(!sk_fullsock(sk)); r->idiag_timer = 0; r->idiag_retrans = 0; r->idiag_expires = 0; if (asoc) { inet_diag_msg_sctpasoc_fill(r, sk, asoc); } else { inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; } if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { u32 mem[SK_MEMINFO_VARS]; int amt; if (asoc && asoc->ep->sndbuf_policy) amt = asoc->sndbuf_used; else amt = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_WMEM_ALLOC] = amt; if (asoc && asoc->ep->rcvbuf_policy) amt = atomic_read(&asoc->rmem_alloc); else amt = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RMEM_ALLOC] = amt; mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) goto errout; } if (ext & (1 << (INET_DIAG_INFO 
- 1))) { struct nlattr *attr; attr = nla_reserve_64bit(skb, INET_DIAG_INFO, sizeof(struct sctp_info), INET_DIAG_PAD); if (!attr) goto errout; info = nla_data(attr); } infox.sctpinfo = (struct sctp_info *)info; infox.asoc = asoc; sctp_diag_get_info(sk, r, &infox); addr_list = asoc ? &asoc->base.bind_addr.address_list : &ep->base.bind_addr.address_list; if (inet_diag_msg_sctpladdrs_fill(skb, addr_list)) goto errout; if (asoc && (ext & (1 << (INET_DIAG_CONG - 1)))) if (nla_put_string(skb, INET_DIAG_CONG, "reno") < 0) goto errout; if (asoc && inet_diag_msg_sctpaddrs_fill(skb, asoc)) goto errout; nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* callback and param */ struct sctp_comm_param { struct sk_buff *skb; struct netlink_callback *cb; const struct inet_diag_req_v2 *r; const struct nlmsghdr *nlh; bool net_admin; }; static size_t inet_assoc_attr_size(struct sctp_association *asoc) { int addrlen = sizeof(struct sockaddr_storage); int addrcnt = 0; struct sctp_sockaddr_entry *laddr; list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list, list) addrcnt++; return nla_total_size(sizeof(struct sctp_info)) + nla_total_size(addrlen * asoc->peer.transport_count) + nla_total_size(addrlen * addrcnt) + nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64; } static int sctp_sock_dump_one(struct sctp_endpoint *ep, struct sctp_transport *tsp, void *p) { struct sctp_association *assoc = tsp->asoc; struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; const struct inet_diag_req_v2 *req = commp->r; struct sk_buff *skb = commp->skb; struct sk_buff *rep; int err; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); if (err) return err; rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL); if (!rep) return -ENOMEM; lock_sock(sk); if (ep != assoc->ep) { err = -EAGAIN; goto out; } err = inet_sctp_diag_fill(sk, assoc, rep, req, sk_user_ns(NETLINK_CB(skb).sk), NETLINK_CB(skb).portid, commp->nlh->nlmsg_seq, 0, commp->nlh, commp->net_admin); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto out; } release_sock(sk); return nlmsg_unicast(sock_net(skb->sk)->diag_nlsk, rep, NETLINK_CB(skb).portid); out: release_sock(sk); kfree_skb(rep); return err; } static int sctp_sock_dump(struct sctp_endpoint *ep, struct sctp_transport *tsp, void *p) { struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; struct sk_buff *skb = commp->skb; struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; struct sctp_association *assoc; int err = 0; lock_sock(sk); if (ep != tsp->asoc->ep) goto release; list_for_each_entry(assoc, &ep->asocs, asocs) { if (cb->args[4] < cb->args[1]) goto next; if (r->id.idiag_sport != htons(assoc->base.bind_addr.port) && r->id.idiag_sport) goto next; if (r->id.idiag_dport != htons(assoc->peer.port) && r->id.idiag_dport) goto next; if (!cb->args[3] && inet_sctp_diag_fill(sk, NULL, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, commp->net_admin) < 0) { err = 1; goto release; } cb->args[3] = 1; if (inet_sctp_diag_fill(sk, assoc, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, cb->nlh, commp->net_admin) < 0) { err = 1; goto release; } next: cb->args[4]++; } cb->args[1] = 0; cb->args[3] = 0; cb->args[4] = 0; release: release_sock(sk); return err; } static int sctp_sock_filter(struct sctp_endpoint *ep, struct sctp_transport *tsp, 
void *p) { struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; const struct inet_diag_req_v2 *r = commp->r; /* find the ep only once through the transports by this condition */ if (!list_is_first(&tsp->asoc->asocs, &ep->asocs)) return 0; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) return 0; return 1; } static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) { struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; struct sk_buff *skb = commp->skb; struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; struct net *net = sock_net(skb->sk); struct inet_sock *inet = inet_sk(sk); int err = 0; if (!net_eq(sock_net(sk), net)) goto out; if (cb->args[4] < cb->args[1]) goto next; if (!(r->idiag_states & TCPF_LISTEN) && !list_empty(&ep->asocs)) goto next; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (inet_sctp_diag_fill(sk, NULL, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, commp->net_admin) < 0) { err = 2; goto out; } next: cb->args[4]++; out: return err; } /* define the functions for sctp_diag_handler*/ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { struct sctp_infox *infox = (struct sctp_infox *)info; if (infox->asoc) { r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc); r->idiag_wqueue = infox->asoc->sndbuf_used; } else { r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } if (infox->sctpinfo) sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); } static int sctp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *skb = cb->skb; struct net *net = sock_net(skb->sk); const struct nlmsghdr *nlh = cb->nlh; union sctp_addr laddr, paddr; int dif = req->id.idiag_if; struct sctp_comm_param commp = { .skb = skb, .r = req, .nlh = nlh, .net_admin = netlink_net_capable(skb, CAP_NET_ADMIN), }; if (req->sdiag_family == AF_INET) { laddr.v4.sin_port = req->id.idiag_sport; laddr.v4.sin_addr.s_addr = req->id.idiag_src[0]; laddr.v4.sin_family = AF_INET; paddr.v4.sin_port = req->id.idiag_dport; paddr.v4.sin_addr.s_addr = req->id.idiag_dst[0]; paddr.v4.sin_family = AF_INET; } else { laddr.v6.sin6_port = req->id.idiag_sport; memcpy(&laddr.v6.sin6_addr, req->id.idiag_src, sizeof(laddr.v6.sin6_addr)); laddr.v6.sin6_family = AF_INET6; paddr.v6.sin6_port = req->id.idiag_dport; memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst, sizeof(paddr.v6.sin6_addr)); paddr.v6.sin6_family = AF_INET6; } return sctp_transport_lookup_process(sctp_sock_dump_one, net, &laddr, &paddr, &commp, dif); } static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { u32 idiag_states = r->idiag_states; struct net *net = sock_net(skb->sk); struct sctp_comm_param commp = { .skb = skb, .cb = cb, .r = r, .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; int pos = cb->args[2]; /* eps hashtable dumps * args: * 0 : if it will traversal listen sock * 1 : to record the sock pos of this time's traversal * 4 : to work as a temporary variable to traversal list */ if (cb->args[0] == 0) { if (!(idiag_states & TCPF_LISTEN)) goto skip; if (sctp_for_each_endpoint(sctp_ep_dump, &commp)) goto done; skip: cb->args[0] = 1; cb->args[1] 
= 0; cb->args[4] = 0; } /* asocs by transport hashtable dump * args: * 1 : to record the assoc pos of this time's traversal * 2 : to record the transport pos of this time's traversal * 3 : to mark if we have dumped the ep info of the current asoc * 4 : to work as a temporary variable to traverse the list * 5 : to save the sk we get from traversing the tsp list. */ if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) goto done; sctp_transport_traverse_process(sctp_sock_filter, sctp_sock_dump, net, &pos, &commp); cb->args[2] = pos; done: cb->args[1] = cb->args[4]; cb->args[4] = 0; } static const struct inet_diag_handler sctp_diag_handler = { .owner = THIS_MODULE, .dump = sctp_diag_dump, .dump_one = sctp_diag_dump_one, .idiag_get_info = sctp_diag_get_info, .idiag_type = IPPROTO_SCTP, .idiag_info_size = sizeof(struct sctp_info), }; static int __init sctp_diag_init(void) { return inet_diag_register(&sctp_diag_handler); } static void __exit sctp_diag_exit(void) { inet_diag_unregister(&sctp_diag_handler); } module_init(sctp_diag_init); module_exit(sctp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SCTP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132);
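The handler registered above is queried over NETLINK_SOCK_DIAG. The sketch below (not part of the module) sends roughly the same dump request that ss --sctp issues, using the standard inet_diag_req_v2 UAPI with sdiag_protocol = IPPROTO_SCTP; reply attribute parsing and error handling are omitted for brevity.

#include <linux/inet_diag.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct {
                struct nlmsghdr nlh;
                struct inet_diag_req_v2 req;
        } msg = {
                .nlh = {
                        .nlmsg_len   = sizeof(msg),
                        .nlmsg_type  = SOCK_DIAG_BY_FAMILY,
                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
                },
                .req = {
                        .sdiag_family   = AF_INET,
                        .sdiag_protocol = IPPROTO_SCTP,
                        .idiag_states   = -1,   /* all states, incl. TCPF_LISTEN */
                },
        };
        char buf[8192];
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);

        if (fd < 0 || send(fd, &msg, sizeof(msg), 0) < 0)
                return 1;

        for (;;) {
                int len = recv(fd, buf, sizeof(buf), 0);
                struct nlmsghdr *h = (struct nlmsghdr *)buf;

                if (len <= 0)
                        break;
                for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
                        if (h->nlmsg_type == NLMSG_DONE ||
                            h->nlmsg_type == NLMSG_ERROR) {
                                close(fd);
                                return 0;
                        }
                        /* each reply carries one inet_diag_msg built by
                         * inet_sctp_diag_fill() for an endpoint or asoc */
                        printf("got one SCTP diag record\n");
                }
        }
        close(fd);
        return 0;
}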
// SPDX-License-Identifier: GPL-2.0-or-later /* * inode.c * * vfs' aops, fops, dops and iops * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/iversion.h> #include <asm/byteorder.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "dir.h" #include "blockcheck.h" #include "dlmglue.h" #include "extent_map.h" #include "file.h" #include "heartbeat.h" #include "inode.h" #include "journal.h" #include "namei.h" #include "suballoc.h" #include "super.h" #include "symlink.h" #include "sysfile.h" #include "uptodate.h" #include "xattr.h" #include "refcounttree.h" #include "ocfs2_trace.h" #include "filecheck.h" #include "buffer_head_io.h" struct ocfs2_find_inode_args { u64 fi_blkno; unsigned long fi_ino; unsigned int fi_flags; unsigned int fi_sysfile_type; }; static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args); static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); static int ocfs2_find_actor(struct inode *inode, void *opaque); static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh); static int ocfs2_filecheck_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags, int type); static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, struct buffer_head *bh); static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, struct buffer_head *bh); void ocfs2_set_inode_flags(struct inode *inode) { unsigned int flags = OCFS2_I(inode)->ip_attr; inode->i_flags &= ~(S_IMMUTABLE | S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); if (flags & OCFS2_IMMUTABLE_FL) inode->i_flags |= S_IMMUTABLE; if (flags & OCFS2_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & OCFS2_APPEND_FL) inode->i_flags |= S_APPEND; if (flags & OCFS2_NOATIME_FL) inode->i_flags |= S_NOATIME; if (flags & OCFS2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; } /* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) { unsigned int flags = oi->vfs_inode.i_flags; oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL| OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL); if (flags & S_SYNC) oi->ip_attr |= OCFS2_SYNC_FL; if (flags & S_APPEND) oi->ip_attr |= OCFS2_APPEND_FL; if (flags & S_IMMUTABLE) oi->ip_attr |= OCFS2_IMMUTABLE_FL; if (flags & S_NOATIME) oi->ip_attr |= OCFS2_NOATIME_FL; if (flags & S_DIRSYNC) oi->ip_attr |= OCFS2_DIRSYNC_FL; } struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) { struct ocfs2_find_inode_args args; args.fi_blkno = blkno; args.fi_flags = 0; args.fi_ino = ino_from_blkno(sb, blkno); args.fi_sysfile_type = 0; return ilookup5(sb, blkno, ocfs2_find_actor, &args); } struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { int rc = -ESTALE; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args
args; journal_t *journal = osb->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); /* Ok. By now we've either got the offsets passed to us by the * caller, or we just pulled them off the bh. Lets do some * sanity checks to make sure they're OK. */ if (blkno == 0) { inode = ERR_PTR(-EINVAL); mlog_errno(PTR_ERR(inode)); goto bail; } args.fi_blkno = blkno; args.fi_flags = flags; args.fi_ino = ino_from_blkno(sb, blkno); args.fi_sysfile_type = sysfile_type; inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, ocfs2_init_locked_inode, &args); /* inode was *not* in the inode cache. 2.6.x requires * us to do our own read_inode call and unlock it * afterwards. */ if (inode == NULL) { inode = ERR_PTR(-ENOMEM); mlog_errno(PTR_ERR(inode)); goto bail; } trace_ocfs2_iget5_locked(inode->i_state); if (inode->i_state & I_NEW) { rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } if (is_bad_inode(inode)) { iput(inode); inode = ERR_PTR(rc); goto bail; } /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; struct ocfs2_inode_info *oi = OCFS2_I(inode); read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); oi->i_sync_tid = tid; oi->i_datasync_tid = tid; } bail: if (!IS_ERR(inode)) { trace_ocfs2_iget_end(inode, (unsigned long long)OCFS2_I(inode)->ip_blkno); } return inode; } /* * here's how inodes get read from disk: * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR * found? : return the in-memory inode * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE */ static int ocfs2_find_actor(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); int ret = 0; args = opaque; mlog_bug_on_msg(!inode, "No inode in find actor!\n"); trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno); if (oi->ip_blkno != args->fi_blkno) goto bail; ret = 1; bail: return ret; } /* * initialize the new inode, but don't do anything that would cause * us to sleep. 
* return 0 on success, 1 on failure */ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = opaque; static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, ocfs2_file_ip_alloc_sem_key; inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; if (args->fi_sysfile_type != 0) lockdep_set_class(&inode->i_rwsem, &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE) lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, &ocfs2_quota_ip_alloc_sem_key); else lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, &ocfs2_file_ip_alloc_sem_key); return 0; } void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, int create_ino) { struct super_block *sb; struct ocfs2_super *osb; int use_plocks = 1; sb = inode->i_sb; osb = OCFS2_SB(sb); if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) use_plocks = 0; /* * These have all been checked by ocfs2_read_inode_block() or set * by ocfs2_mknod_locked(), so a failure is a code bug. */ BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode cannot create a superblock inode today. change if that is needed. */ BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))); BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); inode_set_iversion(inode, 1); inode->i_generation = le32_to_cpu(fe->i_generation); inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); inode->i_mode = le16_to_cpu(fe->i_mode); i_uid_write(inode, le32_to_cpu(fe->i_uid)); i_gid_write(inode, le32_to_cpu(fe->i_gid)); /* Fast symlinks will have i_size but no allocated clusters. 
*/ if (S_ISLNK(inode->i_mode) && !fe->i_clusters) { inode->i_blocks = 0; inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops; } else { inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_mapping->a_ops = &ocfs2_aops; } inode_set_atime(inode, le64_to_cpu(fe->i_atime), le32_to_cpu(fe->i_atime_nsec)); inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), le32_to_cpu(fe->i_mtime_nsec)); inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), le32_to_cpu(fe->i_ctime_nsec)); if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) mlog(ML_ERROR, "ip_blkno %llu != i_blkno %llu!\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)le64_to_cpu(fe->i_blkno)); set_nlink(inode, ocfs2_read_links_count(fe)); trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, le32_to_cpu(fe->i_flags)); if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; inode->i_flags |= S_NOQUOTA; } if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { inode->i_flags |= S_NOQUOTA; } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { /* we can't actually hit this as read_inode can't * handle superblocks today ;-) */ BUG(); } switch (inode->i_mode & S_IFMT) { case S_IFREG: if (use_plocks) inode->i_fop = &ocfs2_fops; else inode->i_fop = &ocfs2_fops_no_plocks; inode->i_op = &ocfs2_file_iops; i_size_write(inode, le64_to_cpu(fe->i_size)); break; case S_IFDIR: inode->i_op = &ocfs2_dir_iops; if (use_plocks) inode->i_fop = &ocfs2_dops; else inode->i_fop = &ocfs2_dops_no_plocks; i_size_write(inode, le64_to_cpu(fe->i_size)); OCFS2_I(inode)->ip_dir_lock_gen = 1; break; case S_IFLNK: inode->i_op = &ocfs2_symlink_inode_operations; inode_nohighmem(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); break; default: inode->i_op = &ocfs2_special_file_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); break; } if (create_ino) { inode->i_ino = ino_from_blkno(inode->i_sb, le64_to_cpu(fe->i_blkno)); /* * If we ever want to create system files from kernel, * the generation argument to * ocfs2_inode_lock_res_init() will have to change. */ BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, OCFS2_LOCK_TYPE_META, 0, inode); ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, OCFS2_LOCK_TYPE_OPEN, 0, inode); } ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, OCFS2_LOCK_TYPE_RW, inode->i_generation, inode); ocfs2_set_inode_flags(inode); OCFS2_I(inode)->ip_last_used_slot = 0; OCFS2_I(inode)->ip_last_used_group = 0; if (S_ISDIR(inode->i_mode)) ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, OCFS2_RESV_FLAG_DIR); } static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args) { struct super_block *sb; struct ocfs2_super *osb; struct ocfs2_dinode *fe; struct buffer_head *bh = NULL; int status, can_lock, lock_level = 0; u32 generation = 0; status = -EINVAL; sb = inode->i_sb; osb = OCFS2_SB(sb); /* * To improve performance of cold-cache inode stats, we take * the cluster lock here if possible. * * Generally, OCFS2 never trusts the contents of an inode * unless it's holding a cluster lock, so taking it here isn't * a correctness issue as much as it is a performance * improvement. 
* * There are three times when taking the lock is not a good idea: * * 1) During startup, before we have initialized the DLM. * * 2) If we are reading certain system files which never get * cluster locks (local alloc, truncate log). * * 3) If the process doing the iget() is responsible for * orphan dir recovery. We're holding the orphan dir lock and * can get into a deadlock with another process on another * node in ->delete_inode(). * * #1 and #2 can be simply solved by never taking the lock * here for system files (which are the only type we read * during mount). It's a heavier approach, but our main * concern is user-accessible files anyway. * * #3 works itself out because we'll eventually take the * cluster lock before trusting anything anyway. */ can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) && !ocfs2_mount_local(osb); trace_ocfs2_read_locked_inode( (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock); /* * To maintain backwards compatibility with older versions of * ocfs2-tools, we still store the generation value for system * files. The only ones that actually matter to userspace are * the journals, but it's easier and inexpensive to just flag * all system files similarly. */ if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) generation = osb->fs_generation; ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, OCFS2_LOCK_TYPE_META, generation, inode); ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, OCFS2_LOCK_TYPE_OPEN, 0, inode); if (can_lock) { status = ocfs2_open_lock(inode); if (status) { make_bad_inode(inode); mlog_errno(status); return status; } status = ocfs2_inode_lock(inode, NULL, lock_level); if (status) { make_bad_inode(inode); mlog_errno(status); return status; } } if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { status = ocfs2_try_open_lock(inode, 0); if (status) { make_bad_inode(inode); return status; } } if (can_lock) { if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) status = ocfs2_filecheck_read_inode_block_full(inode, &bh, OCFS2_BH_IGNORE_CACHE, 0); else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) status = ocfs2_filecheck_read_inode_block_full(inode, &bh, OCFS2_BH_IGNORE_CACHE, 1); else status = ocfs2_read_inode_block_full(inode, &bh, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); /* * If buffer is in jbd, then its checksum may not have been * computed as yet. */ if (!status && !buffer_jbd(bh)) { if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) status = ocfs2_filecheck_validate_inode_block( osb->sb, bh); else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) status = ocfs2_filecheck_repair_inode_block( osb->sb, bh); else status = ocfs2_validate_inode_block( osb->sb, bh); } } if (status < 0) { mlog_errno(status); goto bail; } status = -EINVAL; fe = (struct ocfs2_dinode *) bh->b_data; /* * This is a code bug. Right now the caller needs to * understand whether it is asking for a system file inode or * not so the proper lock names can be built. 
*/ mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), "Inode %llu: system file state is ambiguous\n", (unsigned long long)args->fi_blkno); if (S_ISCHR(le16_to_cpu(fe->i_mode)) || S_ISBLK(le16_to_cpu(fe->i_mode))) inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); ocfs2_populate_inode(inode, fe, 0); BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); if (buffer_dirty(bh) && !buffer_jbd(bh)) { if (can_lock) { ocfs2_inode_unlock(inode, lock_level); lock_level = 1; ocfs2_inode_lock(inode, NULL, lock_level); } status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); if (status < 0) { mlog_errno(status); goto bail; } } status = 0; bail: if (can_lock) ocfs2_inode_unlock(inode, lock_level); if (status < 0) make_bad_inode(inode); brelse(bh); return status; } void ocfs2_sync_blockdev(struct super_block *sb) { sync_blockdev(sb->s_bdev); } static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh) { int status = 0; struct ocfs2_dinode *fe; handle_t *handle = NULL; fe = (struct ocfs2_dinode *) fe_bh->b_data; /* * This check will also skip truncate of inodes with inline * data and fast symlinks. */ if (fe->i_clusters) { if (ocfs2_should_order_data(inode)) ocfs2_begin_ordered_truncate(inode, 0); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; mlog_errno(status); goto out; } status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out; } i_size_write(inode, 0); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { mlog_errno(status); goto out; } ocfs2_commit_trans(osb, handle); handle = NULL; status = ocfs2_commit_truncate(osb, inode, fe_bh); if (status < 0) mlog_errno(status); } out: if (handle) ocfs2_commit_trans(osb, handle); return status; } static int ocfs2_remove_inode(struct inode *inode, struct buffer_head *di_bh, struct inode *orphan_dir_inode, struct buffer_head *orphan_dir_bh) { int status; struct inode *inode_alloc_inode = NULL; struct buffer_head *inode_alloc_bh = NULL; handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; inode_alloc_inode = ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, le16_to_cpu(di->i_suballoc_slot)); if (!inode_alloc_inode) { status = -ENOENT; mlog_errno(status); goto bail; } inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { inode_unlock(inode_alloc_inode); mlog_errno(status); goto bail; } handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + ocfs2_quota_trans_credits(inode->i_sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto bail_unlock; } if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto bail_commit; } } /* set the inodes dtime */ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail_commit; } di->i_dtime = cpu_to_le64(ktime_get_real_seconds()); di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); ocfs2_journal_dirty(handle, di_bh); ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); dquot_free_inode(inode); status = 
ocfs2_free_dinode(handle, inode_alloc_inode, inode_alloc_bh, di); if (status < 0) mlog_errno(status); bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode_alloc_inode, 1); inode_unlock(inode_alloc_inode); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); return status; } /* * Serialize with orphan dir recovery. If the process doing * recovery on this orphan dir does an iget() with the dir * i_rwsem held, we'll deadlock here. Instead we detect this * and exit early - recovery will wipe this inode for us. */ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, int slot) { int ret = 0; spin_lock(&osb->osb_lock); if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { ret = -EDEADLK; goto out; } /* This signals to the orphan recovery process that it should * wait for us to handle the wipe. */ osb->osb_orphan_wipes[slot]++; out: spin_unlock(&osb->osb_lock); trace_ocfs2_check_orphan_recovery_state(slot, ret); return ret; } static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, int slot) { spin_lock(&osb->osb_lock); osb->osb_orphan_wipes[slot]--; spin_unlock(&osb->osb_lock); wake_up(&osb->osb_wipe_event); } static int ocfs2_wipe_inode(struct inode *inode, struct buffer_head *di_bh) { int status, orphaned_slot = -1; struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { orphaned_slot = le16_to_cpu(di->i_orphaned_slot); status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); if (status) return status; orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, orphaned_slot); if (!orphan_dir_inode) { status = -ENOENT; mlog_errno(status); goto bail; } /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { inode_unlock(orphan_dir_inode); mlog_errno(status); goto bail; } } /* we do this while holding the orphan dir lock because we * don't want recovery being run from another node to try an * inode delete underneath us -- this will result in two nodes * truncating the same file! */ status = ocfs2_truncate_for_delete(osb, inode, di_bh); if (status < 0) { mlog_errno(status); goto bail_unlock_dir; } /* Remove any dir index tree */ if (S_ISDIR(inode->i_mode)) { status = ocfs2_dx_dir_truncate(inode, di_bh); if (status) { mlog_errno(status); goto bail_unlock_dir; } } /*Free extended attribute resources associated with this inode.*/ status = ocfs2_xattr_remove(inode, di_bh); if (status < 0) { mlog_errno(status); goto bail_unlock_dir; } status = ocfs2_remove_refcount_tree(inode, di_bh); if (status < 0) { mlog_errno(status); goto bail_unlock_dir; } status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, orphan_dir_bh); if (status < 0) mlog_errno(status); bail_unlock_dir: if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR) return status; ocfs2_inode_unlock(orphan_dir_inode, 1); inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); ocfs2_signal_wipe_completion(osb, orphaned_slot); return status; } /* There is a series of simple checks that should be done before a * trylock is even considered. Encapsulate those in this function. 
*/ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) { int ret = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task, (unsigned long long)oi->ip_blkno, oi->ip_flags); /* We shouldn't be getting here for the root directory * inode.. */ if (inode == osb->root_inode) { mlog(ML_ERROR, "Skipping delete of root inode.\n"); goto bail; } /* * If we're coming from downconvert_thread we can't go into our own * voting [hello, deadlock city!] so we cannot delete the inode. But * since we dropped last inode ref when downconverting dentry lock, * we cannot have the file open and thus the node doing unlink will * take care of deleting the inode. */ if (current == osb->dc_task) goto bail; spin_lock(&oi->ip_lock); /* OCFS2 *never* deletes system files. This should technically * never get here as system file inodes should always have a * positive link count. */ if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { mlog(ML_ERROR, "Skipping delete of system file %llu\n", (unsigned long long)oi->ip_blkno); goto bail_unlock; } ret = 1; bail_unlock: spin_unlock(&oi->ip_lock); bail: return ret; } /* Query the cluster to determine whether we should wipe an inode from * disk or not. * * Requires the inode to have the cluster lock. */ static int ocfs2_query_inode_wipe(struct inode *inode, struct buffer_head *di_bh, int *wipe) { int status = 0, reason = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di; *wipe = 0; trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno, inode->i_nlink); /* While we were waiting for the cluster lock in * ocfs2_delete_inode, another node might have asked to delete * the inode. Recheck our flags to catch this. */ if (!ocfs2_inode_is_valid_to_delete(inode)) { reason = 1; goto bail; } /* Now that we have an up to date inode, we can double check * the link count. */ if (inode->i_nlink) goto bail; /* Do some basic inode verification... */ di = (struct ocfs2_dinode *) di_bh->b_data; if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) && !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { /* * Inodes in the orphan dir must have ORPHANED_FL. The only * inodes that come back out of the orphan dir are reflink * targets. A reflink target may be moved out of the orphan * dir between the time we scan the directory and the time we * process it. This would lead to HAS_REFCOUNT_FL being set but * ORPHANED_FL not. */ if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) { reason = 2; goto bail; } /* for lack of a better error? */ status = -EEXIST; mlog(ML_ERROR, "Inode %llu (on-disk %llu) not orphaned! " "Disk flags 0x%x, inode flags 0x%x\n", (unsigned long long)oi->ip_blkno, (unsigned long long)le64_to_cpu(di->i_blkno), le32_to_cpu(di->i_flags), oi->ip_flags); goto bail; } /* has someone already deleted us?! baaad... */ if (di->i_dtime) { status = -EEXIST; mlog_errno(status); goto bail; } /* * This is how ocfs2 determines whether an inode is still live * within the cluster. Every node takes a shared read lock on * the inode open lock in ocfs2_read_locked_inode(). When we * get to ->delete_inode(), each node tries to convert it's * lock to an exclusive. Trylocks are serialized by the inode * meta data lock. If the upconvert succeeds, we know the inode * is no longer live and can be deleted. * * Though we call this with the meta data lock held, the * trylock keeps us from ABBA deadlock. 
*/ status = ocfs2_try_open_lock(inode, 1); if (status == -EAGAIN) { status = 0; reason = 3; goto bail; } if (status < 0) { mlog_errno(status); goto bail; } *wipe = 1; trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot)); bail: trace_ocfs2_query_inode_wipe_end(status, reason); return status; } /* Support function for ocfs2_delete_inode. Will help us keep the * inode data in a consistent state for clear_inode. Always truncates * pages, optionally sync's them first. */ static void ocfs2_cleanup_delete_inode(struct inode *inode, int sync_data) { trace_ocfs2_cleanup_delete_inode( (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); if (sync_data) filemap_write_and_wait(inode->i_mapping); truncate_inode_pages_final(&inode->i_data); } static void ocfs2_delete_inode(struct inode *inode) { int wipe, status; sigset_t oldset; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di = NULL; trace_ocfs2_delete_inode(inode->i_ino, (unsigned long long)OCFS2_I(inode)->ip_blkno, is_bad_inode(inode)); /* When we fail in read_inode() we mark inode as bad. The second test * catches the case when inode allocation fails before allocating * a block for inode. */ if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) goto bail; if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most * likely be a no-op anyway) */ ocfs2_cleanup_delete_inode(inode, 0); goto bail; } dquot_initialize(inode); /* We want to block signals in delete_inode as the lock and * messaging paths may return us -ERESTARTSYS. Which would * cause us to exit early, resulting in inodes being orphaned * forever. */ ocfs2_block_signals(&oldset); /* * Synchronize us against ocfs2_get_dentry. We take this in * shared mode so that all nodes can still concurrently * process deletes. */ status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0); if (status < 0) { mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status); ocfs2_cleanup_delete_inode(inode, 0); goto bail_unblock; } /* Lock down the inode. This gives us an up to date view of * it's metadata (for verification), and allows us to * serialize delete_inode on multiple nodes. * * Even though we might be doing a truncate, we don't take the * allocation lock here as it won't be needed - nobody will * have the file open. */ status = ocfs2_inode_lock(inode, &di_bh, 1); if (status < 0) { if (status != -ENOENT) mlog_errno(status); ocfs2_cleanup_delete_inode(inode, 0); goto bail_unlock_nfs_sync; } di = (struct ocfs2_dinode *)di_bh->b_data; /* Skip inode deletion and wait for dio orphan entry recovered * first */ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { ocfs2_cleanup_delete_inode(inode, 0); goto bail_unlock_inode; } /* Query the cluster. This will be the final decision made * before we go ahead and wipe the inode. */ status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); if (!wipe || status < 0) { /* Error and remote inode busy both mean we won't be * removing the inode, so they take almost the same * path. */ if (status < 0) mlog_errno(status); /* Someone in the cluster has disallowed a wipe of * this inode, or it was never completely * orphaned. Write out the pages and exit now. */ ocfs2_cleanup_delete_inode(inode, 1); goto bail_unlock_inode; } ocfs2_cleanup_delete_inode(inode, 0); status = ocfs2_wipe_inode(inode, di_bh); if (status < 0) { if (status != -EDEADLK) mlog_errno(status); goto bail_unlock_inode; } /* * Mark the inode as successfully deleted. 
* * This is important for ocfs2_clear_inode() as it will check * this flag and skip any checkpointing work * * ocfs2_stuff_meta_lvb() also uses this flag to invalidate * the LVB for other nodes. */ OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; bail_unlock_inode: ocfs2_inode_unlock(inode, 1); brelse(di_bh); bail_unlock_nfs_sync: ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); bail_unblock: ocfs2_unblock_signals(&oldset); bail: return; } static void ocfs2_clear_inode(struct inode *inode) { int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, inode->i_nlink); mlog_bug_on_msg(osb == NULL, "Inode=%lu\n", inode->i_ino); dquot_drop(inode); /* To preven remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); /* Do these before all the other work so that we don't bounce * the downconvert thread while waiting to destroy the locks. */ ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres); ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); ocfs2_resv_discard(&osb->osb_la_resmap, &oi->ip_la_data_resv); ocfs2_resv_init_once(&oi->ip_la_data_resv); /* We very well may get a clear_inode before all an inodes * metadata has hit disk. Of course, we can't drop any cluster * locks until the journal has finished with it. The only * exception here are successfully wiped inodes - their * metadata can now be considered to be part of the system * inodes from which it came. */ if (!(oi->ip_flags & OCFS2_INODE_DELETED)) ocfs2_checkpoint_inode(inode); mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), "Clear inode of %llu, inode has io markers\n", (unsigned long long)oi->ip_blkno); mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list), "Clear inode of %llu, inode has unwritten extents\n", (unsigned long long)oi->ip_blkno); ocfs2_extent_map_trunc(inode, 0); status = ocfs2_drop_inode_locks(inode); if (status < 0) mlog_errno(status); ocfs2_lock_res_free(&oi->ip_rw_lockres); ocfs2_lock_res_free(&oi->ip_inode_lockres); ocfs2_lock_res_free(&oi->ip_open_lockres); ocfs2_metadata_cache_exit(INODE_CACHE(inode)); mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached, "Clear inode of %llu, inode has %u cache items\n", (unsigned long long)oi->ip_blkno, INODE_CACHE(inode)->ci_num_cached); mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE), "Clear inode of %llu, inode has a bad flag\n", (unsigned long long)oi->ip_blkno); mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), "Clear inode of %llu, inode is locked\n", (unsigned long long)oi->ip_blkno); mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), "Clear inode of %llu, io_mutex is locked\n", (unsigned long long)oi->ip_blkno); mutex_unlock(&oi->ip_io_mutex); /* * down_trylock() returns 0, down_write_trylock() returns 1 * kernel 1, world 0 */ mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), "Clear inode of %llu, alloc_sem is locked\n", (unsigned long long)oi->ip_blkno); up_write(&oi->ip_alloc_sem); mlog_bug_on_msg(oi->ip_open_count, "Clear inode of %llu has open count %d\n", (unsigned long long)oi->ip_blkno, oi->ip_open_count); /* Clear all other flags. */ oi->ip_flags = 0; oi->ip_dir_start_lookup = 0; oi->ip_blkno = 0ULL; /* * ip_jinode is used to track txns against this inode. We ensure that * the journal is flushed before journal shutdown. Thus it is safe to * have inodes get cleaned up after journal shutdown. 
*/ jbd2_journal_release_jbd_inode(osb->journal->j_journal, &oi->ip_jinode); } void ocfs2_evict_inode(struct inode *inode) { if (!inode->i_nlink || (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { ocfs2_delete_inode(inode); } else { truncate_inode_pages_final(&inode->i_data); } ocfs2_clear_inode(inode); } /* Called under inode_lock, with no more references on the * struct inode, so it's safe here to check the flags field * and to manipulate i_nlink without any other locks. */ int ocfs2_drop_inode(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); assert_spin_locked(&inode->i_lock); inode->i_state |= I_WILL_FREE; spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; return 1; } /* * This is called from our getattr. */ int ocfs2_inode_revalidate(struct dentry *dentry) { struct inode *inode = d_inode(dentry); int status = 0; trace_ocfs2_inode_revalidate(inode, inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL, inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0); if (!inode) { status = -ENOENT; goto bail; } spin_lock(&OCFS2_I(inode)->ip_lock); if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { spin_unlock(&OCFS2_I(inode)->ip_lock); status = -ENOENT; goto bail; } spin_unlock(&OCFS2_I(inode)->ip_lock); /* Let ocfs2_inode_lock do the work of updating our struct * inode for us. */ status = ocfs2_inode_lock(inode, NULL, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); goto bail; } ocfs2_inode_unlock(inode, 0); bail: return status; } /* * Updates a disk inode from a * struct inode. * Only takes ip_lock. */ int ocfs2_mark_inode_dirty(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int status; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno); status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; } spin_lock(&OCFS2_I(inode)->ip_lock); fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); ocfs2_get_inode_flags(OCFS2_I(inode)); fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr); fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features); spin_unlock(&OCFS2_I(inode)->ip_lock); fe->i_size = cpu_to_le64(i_size_read(inode)); ocfs2_set_links_count(fe, inode->i_nlink); fe->i_uid = cpu_to_le32(i_uid_read(inode)); fe->i_gid = cpu_to_le32(i_gid_read(inode)); fe->i_mode = cpu_to_le16(inode->i_mode); fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); ocfs2_journal_dirty(handle, bh); ocfs2_update_inode_fsync_trans(handle, inode, 1); leave: return status; } /* * * Updates a struct inode from a disk inode. * does no i/o, only takes ip_lock. 
*/ void ocfs2_refresh_inode(struct inode *inode, struct ocfs2_dinode *fe) { spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); ocfs2_set_inode_flags(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); set_nlink(inode, ocfs2_read_links_count(fe)); i_uid_write(inode, le32_to_cpu(fe->i_uid)); i_gid_write(inode, le32_to_cpu(fe->i_gid)); inode->i_mode = le16_to_cpu(fe->i_mode); if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) inode->i_blocks = 0; else inode->i_blocks = ocfs2_inode_sector_count(inode); inode_set_atime(inode, le64_to_cpu(fe->i_atime), le32_to_cpu(fe->i_atime_nsec)); inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), le32_to_cpu(fe->i_mtime_nsec)); inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), le32_to_cpu(fe->i_ctime_nsec)); spin_unlock(&OCFS2_I(inode)->ip_lock); } int ocfs2_validate_inode_block(struct super_block *sb, struct buffer_head *bh) { int rc; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); /* * If the ecc fails, we return the error but otherwise * leave the filesystem running. We know any error is * local to this block. */ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); if (rc) { mlog(ML_ERROR, "Checksum failed for dinode %llu\n", (unsigned long long)bh->b_blocknr); goto bail; } /* * Errors after here are fatal. */ rc = -EINVAL; if (!OCFS2_IS_VALID_DINODE(di)) { rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", (unsigned long long)bh->b_blocknr, 7, di->i_signature); goto bail; } if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(di->i_blkno)); goto bail; } if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { rc = ocfs2_error(sb, "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", (unsigned long long)bh->b_blocknr); goto bail; } if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { rc = ocfs2_error(sb, "Invalid dinode #%llu: fs_generation is %u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(di->i_fs_generation)); goto bail; } rc = 0; bail: return rc; } static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, struct buffer_head *bh) { int rc = 0; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; trace_ocfs2_filecheck_validate_inode_block( (unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); /* * Call ocfs2_validate_meta_ecc() first since it has ecc repair * function, but we should not return error immediately when ecc * validation fails, because the reason is quite likely the invalid * inode number inputed. 
*/ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); if (rc) { mlog(ML_ERROR, "Filecheck: checksum failed for dinode %llu\n", (unsigned long long)bh->b_blocknr); rc = -OCFS2_FILECHECK_ERR_BLOCKECC; } if (!OCFS2_IS_VALID_DINODE(di)) { mlog(ML_ERROR, "Filecheck: invalid dinode #%llu: signature = %.*s\n", (unsigned long long)bh->b_blocknr, 7, di->i_signature); rc = -OCFS2_FILECHECK_ERR_INVALIDINO; goto bail; } else if (rc) goto bail; if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { mlog(ML_ERROR, "Filecheck: invalid dinode #%llu: i_blkno is %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(di->i_blkno)); rc = -OCFS2_FILECHECK_ERR_BLOCKNO; goto bail; } if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { mlog(ML_ERROR, "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL " "not set\n", (unsigned long long)bh->b_blocknr); rc = -OCFS2_FILECHECK_ERR_VALIDFLAG; goto bail; } if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { mlog(ML_ERROR, "Filecheck: invalid dinode #%llu: fs_generation is %u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(di->i_fs_generation)); rc = -OCFS2_FILECHECK_ERR_GENERATION; } bail: return rc; } static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, struct buffer_head *bh) { int changed = 0; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; if (!ocfs2_filecheck_validate_inode_block(sb, bh)) return 0; trace_ocfs2_filecheck_repair_inode_block( (unsigned long long)bh->b_blocknr); if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || ocfs2_is_soft_readonly(OCFS2_SB(sb))) { mlog(ML_ERROR, "Filecheck: cannot repair dinode #%llu " "on readonly filesystem\n", (unsigned long long)bh->b_blocknr); return -OCFS2_FILECHECK_ERR_READONLY; } if (buffer_jbd(bh)) { mlog(ML_ERROR, "Filecheck: cannot repair dinode #%llu, " "its buffer is in jbd\n", (unsigned long long)bh->b_blocknr); return -OCFS2_FILECHECK_ERR_INJBD; } if (!OCFS2_IS_VALID_DINODE(di)) { /* Cannot fix invalid inode block */ return -OCFS2_FILECHECK_ERR_INVALIDINO; } if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { /* Cannot just add VALID_FL flag back as a fix, * need more things to check here. */ return -OCFS2_FILECHECK_ERR_VALIDFLAG; } if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { di->i_blkno = cpu_to_le64(bh->b_blocknr); changed = 1; mlog(ML_ERROR, "Filecheck: reset dinode #%llu: i_blkno to %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(di->i_blkno)); } if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); changed = 1; mlog(ML_ERROR, "Filecheck: reset dinode #%llu: fs_generation to %u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(di->i_fs_generation)); } if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); mark_buffer_dirty(bh); mlog(ML_ERROR, "Filecheck: reset dinode #%llu: compute meta ecc\n", (unsigned long long)bh->b_blocknr); } return 0; } static int ocfs2_filecheck_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags, int type) { int rc; struct buffer_head *tmp = *bh; if (!type) /* Check inode block */ rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, 1, &tmp, flags, ocfs2_filecheck_validate_inode_block); else /* Repair inode block */ rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, 1, &tmp, flags, ocfs2_filecheck_repair_inode_block); /* If ocfs2_read_blocks() got us a new bh, pass it up. 
*/ if (!rc && !*bh) *bh = tmp; return rc; } int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags) { int rc; struct buffer_head *tmp = *bh; rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, 1, &tmp, flags, ocfs2_validate_inode_block); /* If ocfs2_read_blocks() got us a new bh, pass it up. */ if (!rc && !*bh) *bh = tmp; return rc; } int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) { return ocfs2_read_inode_block_full(inode, bh, 0); } static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); return oi->ip_blkno; } static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); return oi->vfs_inode.i_sb; } static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) __acquires(&oi->ip_lock) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); spin_lock(&oi->ip_lock); } static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) __releases(&oi->ip_lock) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); spin_unlock(&oi->ip_lock); } static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); mutex_lock(&oi->ip_io_mutex); } static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); mutex_unlock(&oi->ip_io_mutex); } const struct ocfs2_caching_operations ocfs2_inode_caching_ops = { .co_owner = ocfs2_inode_cache_owner, .co_get_super = ocfs2_inode_cache_get_super, .co_cache_lock = ocfs2_inode_cache_lock, .co_cache_unlock = ocfs2_inode_cache_unlock, .co_io_lock = ocfs2_inode_cache_io_lock, .co_io_unlock = ocfs2_inode_cache_io_unlock, };
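A note on the structure above: ocfs2_inode_caching_ops is the usual kernel pattern of embedding a small generic object (struct ocfs2_caching_info) inside a larger one (struct ocfs2_inode_info) and recovering the container inside each callback via cache_info_to_inode(), a container_of() wrapper. The stand-alone sketch below shows the same pattern in plain userspace C; the names my_cache, my_inode and my_caching_ops are invented for illustration and are not part of the ocfs2 sources (build with: cc -pthread demo.c).

#include <stdio.h>
#include <stddef.h>
#include <pthread.h>

/* container_of(): recover the enclosing structure from a member pointer. */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Generic caching object, analogous to struct ocfs2_caching_info. */
struct my_cache {
        const struct my_cache_ops *ops;
};

/* Per-"inode" object embedding the generic part, like ocfs2_inode_info. */
struct my_inode {
        unsigned long long blkno;
        pthread_mutex_t lock;
        struct my_cache cache;          /* embedded generic member */
};

struct my_cache_ops {
        unsigned long long (*owner)(struct my_cache *c);
        void (*lock)(struct my_cache *c);
        void (*unlock)(struct my_cache *c);
};

static struct my_inode *cache_to_inode(struct my_cache *c)
{
        return container_of(c, struct my_inode, cache);
}

static unsigned long long my_owner(struct my_cache *c)
{
        return cache_to_inode(c)->blkno;
}

static void my_lock(struct my_cache *c)
{
        pthread_mutex_lock(&cache_to_inode(c)->lock);
}

static void my_unlock(struct my_cache *c)
{
        pthread_mutex_unlock(&cache_to_inode(c)->lock);
}

static const struct my_cache_ops my_caching_ops = {
        .owner  = my_owner,
        .lock   = my_lock,
        .unlock = my_unlock,
};

int main(void)
{
        struct my_inode ino = {
                .blkno = 1234,
                .lock  = PTHREAD_MUTEX_INITIALIZER,
                .cache = { .ops = &my_caching_ops },
        };

        /* Generic code only sees &ino.cache, yet reaches the inode data. */
        ino.cache.ops->lock(&ino.cache);
        printf("owner blkno = %llu\n", ino.cache.ops->owner(&ino.cache));
        ino.cache.ops->unlock(&ino.cache);
        return 0;
}

Generic code holds only the embedded member, yet every callback can reach the fields of the enclosing inode, which is how the ocfs2 caching layer takes ip_lock and ip_io_mutex without knowing anything about struct inode.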
// SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/auth.c *
* Generic RPC client authentication API. * * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ #include <linux/types.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/hash.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> #include <linux/spinlock.h> #include <trace/events/sunrpc.h> #define RPC_CREDCACHE_DEFAULT_HASHBITS (4) struct rpc_cred_cache { struct hlist_head *hashtable; unsigned int hashbits; spinlock_t lock; }; static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops, [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops, [RPC_AUTH_TLS] = (const struct rpc_authops __force __rcu *)&authtls_ops, }; static LIST_HEAD(cred_unused); static unsigned long number_cred_unused; static struct cred machine_cred = { .usage = ATOMIC_INIT(1), }; /* * Return the machine_cred pointer to be used whenever * the a generic machine credential is needed. */ const struct cred *rpc_machine_cred(void) { return &machine_cred; } EXPORT_SYMBOL_GPL(rpc_machine_cred); #define MAX_HASHTABLE_BITS (14) static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) { unsigned long num; unsigned int nbits; int ret; if (!val) goto out_inval; ret = kstrtoul(val, 0, &num); if (ret) goto out_inval; nbits = fls(num - 1); if (nbits > MAX_HASHTABLE_BITS || nbits < 2) goto out_inval; *(unsigned int *)kp->arg = nbits; return 0; out_inval: return -EINVAL; } static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp) { unsigned int nbits; nbits = *(unsigned int *)kp->arg; return sprintf(buffer, "%u\n", 1U << nbits); } #define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); static const struct kernel_param_ops param_ops_hashtbl_sz = { .set = param_set_hashtbl_sz, .get = param_get_hashtbl_sz, }; module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644); MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); static unsigned long auth_max_cred_cachesize = ULONG_MAX; module_param(auth_max_cred_cachesize, ulong, 0644); MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size"); static u32 pseudoflavor_to_flavor(u32 flavor) { if (flavor > RPC_AUTH_MAXFLAVOR) return RPC_AUTH_GSS; return flavor; } int rpcauth_register(const struct rpc_authops *ops) { const struct rpc_authops *old; rpc_authflavor_t flavor; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops); if (old == NULL || old == ops) return 0; return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_register); int rpcauth_unregister(const struct rpc_authops *ops) { const struct rpc_authops *old; rpc_authflavor_t flavor; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL); if (old == ops || old == NULL) return 0; return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_unregister); static const struct rpc_authops * rpcauth_get_authops(rpc_authflavor_t flavor) { const struct rpc_authops *ops; if (flavor >= RPC_AUTH_MAXFLAVOR) return NULL; rcu_read_lock(); ops = rcu_dereference(auth_flavors[flavor]); if (ops == NULL) { rcu_read_unlock(); request_module("rpc-auth-%u", flavor); rcu_read_lock(); ops = 
rcu_dereference(auth_flavors[flavor]); if (ops == NULL) goto out; } if (!try_module_get(ops->owner)) ops = NULL; out: rcu_read_unlock(); return ops; } static void rpcauth_put_authops(const struct rpc_authops *ops) { module_put(ops->owner); } /** * rpcauth_get_pseudoflavor - check if security flavor is supported * @flavor: a security flavor * @info: a GSS mech OID, quality of protection, and service value * * Verifies that an appropriate kernel module is available or already loaded. * Returns an equivalent pseudoflavor, or RPC_AUTH_MAXFLAVOR if "flavor" is * not supported locally. */ rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) { const struct rpc_authops *ops = rpcauth_get_authops(flavor); rpc_authflavor_t pseudoflavor; if (!ops) return RPC_AUTH_MAXFLAVOR; pseudoflavor = flavor; if (ops->info2flavor != NULL) pseudoflavor = ops->info2flavor(info); rpcauth_put_authops(ops); return pseudoflavor; } EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); /** * rpcauth_get_gssinfo - find GSS tuple matching a GSS pseudoflavor * @pseudoflavor: GSS pseudoflavor to match * @info: rpcsec_gss_info structure to fill in * * Returns zero and fills in "info" if pseudoflavor matches a * supported mechanism. */ int rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) { rpc_authflavor_t flavor = pseudoflavor_to_flavor(pseudoflavor); const struct rpc_authops *ops; int result; ops = rpcauth_get_authops(flavor); if (ops == NULL) return -ENOENT; result = -ENOENT; if (ops->flavor2info != NULL) result = ops->flavor2info(pseudoflavor, info); rpcauth_put_authops(ops); return result; } EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct rpc_auth *auth = ERR_PTR(-EINVAL); const struct rpc_authops *ops; u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); ops = rpcauth_get_authops(flavor); if (ops == NULL) goto out; auth = ops->create(args, clnt); rpcauth_put_authops(ops); if (IS_ERR(auth)) return auth; if (clnt->cl_auth) rpcauth_release(clnt->cl_auth); clnt->cl_auth = auth; out: return auth; } EXPORT_SYMBOL_GPL(rpcauth_create); void rpcauth_release(struct rpc_auth *auth) { if (!refcount_dec_and_test(&auth->au_count)) return; auth->au_ops->destroy(auth); } static DEFINE_SPINLOCK(rpc_credcache_lock); /* * On success, the caller is responsible for freeing the reference * held by the hashtable */ static bool rpcauth_unhash_cred_locked(struct rpc_cred *cred) { if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) return false; hlist_del_rcu(&cred->cr_hash); return true; } static bool rpcauth_unhash_cred(struct rpc_cred *cred) { spinlock_t *cache_lock; bool ret; if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) return false; cache_lock = &cred->cr_auth->au_credcache->lock; spin_lock(cache_lock); ret = rpcauth_unhash_cred_locked(cred); spin_unlock(cache_lock); return ret; } /* * Initialize RPC credential cache */ int rpcauth_init_credcache(struct rpc_auth *auth) { struct rpc_cred_cache *new; unsigned int hashsize; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out_nocache; new->hashbits = auth_hashbits; hashsize = 1U << new->hashbits; new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL); if (!new->hashtable) goto out_nohashtbl; spin_lock_init(&new->lock); auth->au_credcache = new; return 0; out_nohashtbl: kfree(new); out_nocache: return -ENOMEM; } EXPORT_SYMBOL_GPL(rpcauth_init_credcache); char * 
rpcauth_stringify_acceptor(struct rpc_cred *cred) { if (!cred->cr_ops->crstringify_acceptor) return NULL; return cred->cr_ops->crstringify_acceptor(cred); } EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor); /* * Destroy a list of credentials */ static inline void rpcauth_destroy_credlist(struct list_head *head) { struct rpc_cred *cred; while (!list_empty(head)) { cred = list_entry(head->next, struct rpc_cred, cr_lru); list_del_init(&cred->cr_lru); put_rpccred(cred); } } static void rpcauth_lru_add_locked(struct rpc_cred *cred) { if (!list_empty(&cred->cr_lru)) return; number_cred_unused++; list_add_tail(&cred->cr_lru, &cred_unused); } static void rpcauth_lru_add(struct rpc_cred *cred) { if (!list_empty(&cred->cr_lru)) return; spin_lock(&rpc_credcache_lock); rpcauth_lru_add_locked(cred); spin_unlock(&rpc_credcache_lock); } static void rpcauth_lru_remove_locked(struct rpc_cred *cred) { if (list_empty(&cred->cr_lru)) return; number_cred_unused--; list_del_init(&cred->cr_lru); } static void rpcauth_lru_remove(struct rpc_cred *cred) { if (list_empty(&cred->cr_lru)) return; spin_lock(&rpc_credcache_lock); rpcauth_lru_remove_locked(cred); spin_unlock(&rpc_credcache_lock); } /* * Clear the RPC credential cache, and delete those credentials * that are not referenced. */ void rpcauth_clear_credcache(struct rpc_cred_cache *cache) { LIST_HEAD(free); struct hlist_head *head; struct rpc_cred *cred; unsigned int hashsize = 1U << cache->hashbits; int i; spin_lock(&rpc_credcache_lock); spin_lock(&cache->lock); for (i = 0; i < hashsize; i++) { head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); rpcauth_unhash_cred_locked(cred); /* Note: We now hold a reference to cred */ rpcauth_lru_remove_locked(cred); list_add_tail(&cred->cr_lru, &free); } } spin_unlock(&cache->lock); spin_unlock(&rpc_credcache_lock); rpcauth_destroy_credlist(&free); } /* * Destroy the RPC credential cache */ void rpcauth_destroy_credcache(struct rpc_auth *auth) { struct rpc_cred_cache *cache = auth->au_credcache; if (cache) { auth->au_credcache = NULL; rpcauth_clear_credcache(cache); kfree(cache->hashtable); kfree(cache); } } EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); #define RPC_AUTH_EXPIRY_MORATORIUM (60 * HZ) /* * Remove stale credentials. Avoid sleeping inside the loop. */ static long rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { struct rpc_cred *cred, *next; unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; long freed = 0; list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) { if (nr_to_scan-- == 0) break; if (refcount_read(&cred->cr_count) > 1) { rpcauth_lru_remove_locked(cred); continue; } /* * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. */ if (time_in_range(cred->cr_expire, expired, jiffies)) continue; if (!rpcauth_unhash_cred(cred)) continue; rpcauth_lru_remove_locked(cred); freed++; list_add_tail(&cred->cr_lru, free); } return freed ? freed : SHRINK_STOP; } static unsigned long rpcauth_cache_do_shrink(int nr_to_scan) { LIST_HEAD(free); unsigned long freed; spin_lock(&rpc_credcache_lock); freed = rpcauth_prune_expired(&free, nr_to_scan); spin_unlock(&rpc_credcache_lock); rpcauth_destroy_credlist(&free); return freed; } /* * Run memory cache shrinker. 
*/ static unsigned long rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) return SHRINK_STOP; /* nothing left, don't come back */ if (list_empty(&cred_unused)) return SHRINK_STOP; return rpcauth_cache_do_shrink(sc->nr_to_scan); } static unsigned long rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return number_cred_unused * sysctl_vfs_cache_pressure / 100; } static void rpcauth_cache_enforce_limit(void) { unsigned long diff; unsigned int nr_to_scan; if (number_cred_unused <= auth_max_cred_cachesize) return; diff = number_cred_unused - auth_max_cred_cachesize; nr_to_scan = 100; if (diff < nr_to_scan) nr_to_scan = diff; rpcauth_cache_do_shrink(nr_to_scan); } /* * Look up a process' credentials in the authentication cache */ struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, int flags, gfp_t gfp) { LIST_HEAD(free); struct rpc_cred_cache *cache = auth->au_credcache; struct rpc_cred *cred = NULL, *entry, *new; unsigned int nr; nr = auth->au_ops->hash_cred(acred, cache->hashbits); rcu_read_lock(); hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); if (cred) break; } rcu_read_unlock(); if (cred != NULL) goto found; new = auth->au_ops->crcreate(auth, acred, flags, gfp); if (IS_ERR(new)) { cred = new; goto out; } spin_lock(&cache->lock); hlist_for_each_entry(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); if (cred) break; } if (cred == NULL) { cred = new; set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); refcount_inc(&cred->cr_count); hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]); } else list_add_tail(&new->cr_lru, &free); spin_unlock(&cache->lock); rpcauth_cache_enforce_limit(); found: if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && cred->cr_ops->cr_init != NULL && !(flags & RPCAUTH_LOOKUP_NEW)) { int res = cred->cr_ops->cr_init(auth, cred); if (res < 0) { put_rpccred(cred); cred = ERR_PTR(res); } } rpcauth_destroy_credlist(&free); out: return cred; } EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *auth, int flags) { struct auth_cred acred; struct rpc_cred *ret; const struct cred *cred = current_cred(); memset(&acred, 0, sizeof(acred)); acred.cred = cred; ret = auth->au_ops->lookup_cred(auth, &acred, flags); return ret; } EXPORT_SYMBOL_GPL(rpcauth_lookupcred); void rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, struct rpc_auth *auth, const struct rpc_credops *ops) { INIT_HLIST_NODE(&cred->cr_hash); INIT_LIST_HEAD(&cred->cr_lru); refcount_set(&cred->cr_count, 1); cred->cr_auth = auth; cred->cr_flags = 0; cred->cr_ops = ops; cred->cr_expire = jiffies; cred->cr_cred = get_cred(acred->cred); } EXPORT_SYMBOL_GPL(rpcauth_init_cred); static struct rpc_cred * rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .cred = get_task_cred(&init_task), }; struct rpc_cred *ret; if (RPC_IS_ASYNC(task)) lookupflags |= RPCAUTH_LOOKUP_ASYNC; ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); put_cred(acred.cred); return ret; } static struct rpc_cred * rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .principal = 
task->tk_client->cl_principal, .cred = init_task.cred, }; if (!acred.principal) return NULL; if (RPC_IS_ASYNC(task)) lookupflags |= RPCAUTH_LOOKUP_ASYNC; return auth->au_ops->lookup_cred(auth, &acred, lookupflags); } static struct rpc_cred * rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; return rpcauth_lookupcred(auth, lookupflags); } static int rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *new = NULL; int lookupflags = 0; struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .cred = cred, }; if (flags & RPC_TASK_ASYNC) lookupflags |= RPCAUTH_LOOKUP_NEW | RPCAUTH_LOOKUP_ASYNC; if (task->tk_op_cred) /* Task must use exactly this rpc_cred */ new = get_rpccred(task->tk_op_cred); else if (cred != NULL && cred != &machine_cred) new = auth->au_ops->lookup_cred(auth, &acred, lookupflags); else if (cred == &machine_cred) new = rpcauth_bind_machine_cred(task, lookupflags); /* If machine cred couldn't be bound, try a root cred */ if (new) ; else if (cred == &machine_cred) new = rpcauth_bind_root_cred(task, lookupflags); else if (flags & RPC_TASK_NULLCREDS) new = authnull_ops.lookup_cred(NULL, NULL, 0); else new = rpcauth_bind_new_cred(task, lookupflags); if (IS_ERR(new)) return PTR_ERR(new); put_rpccred(req->rq_cred); req->rq_cred = new; return 0; } void put_rpccred(struct rpc_cred *cred) { if (cred == NULL) return; rcu_read_lock(); if (refcount_dec_and_test(&cred->cr_count)) goto destroy; if (refcount_read(&cred->cr_count) != 1 || !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) goto out; if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { cred->cr_expire = jiffies; rpcauth_lru_add(cred); /* Race breaker */ if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))) rpcauth_lru_remove(cred); } else if (rpcauth_unhash_cred(cred)) { rpcauth_lru_remove(cred); if (refcount_dec_and_test(&cred->cr_count)) goto destroy; } out: rcu_read_unlock(); return; destroy: rcu_read_unlock(); cred->cr_ops->crdestroy(cred); } EXPORT_SYMBOL_GPL(put_rpccred); /** * rpcauth_marshcred - Append RPC credential to end of @xdr * @task: controlling RPC task * @xdr: xdr_stream containing initial portion of RPC Call header * * On success, an appropriate verifier is added to @xdr, @xdr is * updated to point past the verifier, and zero is returned. * Otherwise, @xdr is in an undefined state and a negative errno * is returned. */ int rpcauth_marshcred(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crmarshal(task, xdr); } /** * rpcauth_wrap_req_encode - XDR encode the RPC procedure * @task: controlling RPC task * @xdr: stream where on-the-wire bytes are to be marshalled * * On success, @xdr contains the encoded and wrapped message. * Otherwise, @xdr is in an undefined state. */ int rpcauth_wrap_req_encode(struct rpc_task *task, struct xdr_stream *xdr) { kxdreproc_t encode = task->tk_msg.rpc_proc->p_encode; encode(task->tk_rqstp, xdr, task->tk_msg.rpc_argp); return 0; } EXPORT_SYMBOL_GPL(rpcauth_wrap_req_encode); /** * rpcauth_wrap_req - XDR encode and wrap the RPC procedure * @task: controlling RPC task * @xdr: stream where on-the-wire bytes are to be marshalled * * On success, @xdr contains the encoded and wrapped message, * and zero is returned. Otherwise, @xdr is in an undefined * state and a negative errno is returned. 
*/ int rpcauth_wrap_req(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crwrap_req(task, xdr); } /** * rpcauth_checkverf - Validate verifier in RPC Reply header * @task: controlling RPC task * @xdr: xdr_stream containing RPC Reply header * * Return values: * %0: Verifier is valid. @xdr now points past the verifier. * %-EIO: Verifier is corrupted or message ended early. * %-EACCES: Verifier is intact but not valid. * %-EPROTONOSUPPORT: Server does not support the requested auth type. * * When a negative errno is returned, @xdr is left in an undefined * state. */ int rpcauth_checkverf(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crvalidate(task, xdr); } /** * rpcauth_unwrap_resp_decode - Invoke XDR decode function * @task: controlling RPC task * @xdr: stream where the Reply message resides * * Returns zero on success; otherwise a negative errno is returned. */ int rpcauth_unwrap_resp_decode(struct rpc_task *task, struct xdr_stream *xdr) { kxdrdproc_t decode = task->tk_msg.rpc_proc->p_decode; return decode(task->tk_rqstp, xdr, task->tk_msg.rpc_resp); } EXPORT_SYMBOL_GPL(rpcauth_unwrap_resp_decode); /** * rpcauth_unwrap_resp - Invoke unwrap and decode function for the cred * @task: controlling RPC task * @xdr: stream where the Reply message resides * * Returns zero on success; otherwise a negative errno is returned. */ int rpcauth_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crunwrap_resp(task, xdr); } bool rpcauth_xmit_need_reencode(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; if (!cred || !cred->cr_ops->crneed_reencode) return false; return cred->cr_ops->crneed_reencode(task); } int rpcauth_refreshcred(struct rpc_task *task) { struct rpc_cred *cred; int err; cred = task->tk_rqstp->rq_cred; if (cred == NULL) { err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags); if (err < 0) goto out; cred = task->tk_rqstp->rq_cred; } err = cred->cr_ops->crrefresh(task); out: if (err < 0) task->tk_status = err; return err; } void rpcauth_invalcred(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; if (cred) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); } int rpcauth_uptodatecred(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; return cred == NULL || test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; } static struct shrinker *rpc_cred_shrinker; int __init rpcauth_init_module(void) { int err; err = rpc_init_authunix(); if (err < 0) goto out1; rpc_cred_shrinker = shrinker_alloc(0, "sunrpc_cred"); if (!rpc_cred_shrinker) { err = -ENOMEM; goto out2; } rpc_cred_shrinker->count_objects = rpcauth_cache_shrink_count; rpc_cred_shrinker->scan_objects = rpcauth_cache_shrink_scan; shrinker_register(rpc_cred_shrinker); return 0; out2: rpc_destroy_authunix(); out1: return err; } void rpcauth_remove_module(void) { rpc_destroy_authunix(); shrinker_free(rpc_cred_shrinker); }
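The registration pair near the top of this file, rpcauth_register()/rpcauth_unregister(), claims or releases an auth_flavors[] slot with a single cmpxchg(), so a flavor can be installed only into an empty slot and removed only by the ops that currently own it. A minimal userspace analogue of that lock-free slot protocol, using C11 atomics and invented names (flavor_ops, register_flavor, unregister_flavor), might look like the sketch below; it illustrates the idea rather than the SUNRPC code itself.

#include <stdatomic.h>
#include <stdio.h>
#include <errno.h>

#define MAX_FLAVOR 8

struct flavor_ops {
        const char *name;
};

/* One atomically-updated slot per flavor, like auth_flavors[]. */
static _Atomic(const struct flavor_ops *) flavors[MAX_FLAVOR];

/* Claim the slot only if it is currently empty (NULL -> ops). */
static int register_flavor(unsigned int flavor, const struct flavor_ops *ops)
{
        const struct flavor_ops *expected = NULL;

        if (flavor >= MAX_FLAVOR)
                return -EINVAL;
        if (atomic_compare_exchange_strong(&flavors[flavor], &expected, ops))
                return 0;
        /* Slot already owned; re-registering the same ops is harmless. */
        return expected == ops ? 0 : -EPERM;
}

/* Release the slot only if it still holds our ops (ops -> NULL). */
static int unregister_flavor(unsigned int flavor, const struct flavor_ops *ops)
{
        const struct flavor_ops *expected = ops;

        if (flavor >= MAX_FLAVOR)
                return -EINVAL;
        if (atomic_compare_exchange_strong(&flavors[flavor], &expected, NULL))
                return 0;
        return expected == NULL ? 0 : -EPERM;
}

int main(void)
{
        static const struct flavor_ops null_ops  = { .name = "null" };
        static const struct flavor_ops other_ops = { .name = "other" };

        printf("register null:  %d\n", register_flavor(1, &null_ops));   /* 0 */
        printf("register other: %d\n", register_flavor(1, &other_ops));  /* -EPERM */
        printf("unregister:     %d\n", unregister_flavor(1, &null_ops)); /* 0 */
        return 0;
}

As in the kernel version, re-registering the same ops or unregistering an already-empty slot is treated as success, while a conflicting owner gets -EPERM.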
// SPDX-License-Identifier: GPL-2.0 /* Multipath TCP cryptographic functions * Copyright (c) 2017 - 2019, Intel Corporation. * * Note: This code is based on mptcp_ctrl.c, mptcp_ipv4.c, and * mptcp_ipv6 from multipath-tcp.org, authored by: * * Sébastien Barré <sebastien.barre@uclouvain.be> * Christoph Paasch <christoph.paasch@uclouvain.be> * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> * Gregory Detal <gregory.detal@uclouvain.be> * Fabien Duchêne <fabien.duchene@uclouvain.be> * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> * Lavkesh Lahngir <lavkesh51@gmail.com> * Andreas Ripke <ripke@neclab.eu> * Vlad Dogaru <vlad.dogaru@intel.com> * Octavian Purdila <octavian.purdila@intel.com> * John Ronan <jronan@tssg.org> * Catalin Nicutar <catalin.nicutar@gmail.com> * Brandon Heller <brandonh@stanford.edu> */ #include <linux/kernel.h> #include <crypto/sha2.h> #include <linux/unaligned.h> #include "protocol.h" #define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4) void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) { __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; __be64 input = cpu_to_be64(key); sha256((__force u8 *)&input, sizeof(input), (u8 *)mptcp_hashed_key); if (token) *token = be32_to_cpu(mptcp_hashed_key[0]); if (idsn) *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6])); } void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) { u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; u8 key1be[8]; u8 key2be[8]; int i; if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE)) len = SHA256_DIGEST_SIZE; put_unaligned_be64(key1, key1be); put_unaligned_be64(key2, key2be); /* Generate key xored with ipad */ memset(input, 0x36, SHA256_BLOCK_SIZE); for (i = 0; i < 8; i++) input[i] ^= key1be[i]; for (i = 0; i < 8; i++) input[i + 8] ^= key2be[i]; memcpy(&input[SHA256_BLOCK_SIZE], msg, len); /* emit sha256(K1 || msg) on the second input block, so we can * reuse 'input' for the last hashing */ sha256(input, SHA256_BLOCK_SIZE + len, &input[SHA256_BLOCK_SIZE]); /* Prepare second part of hmac */ memset(input, 0x5C, SHA256_BLOCK_SIZE); for (i = 0; i < 8; i++) input[i] ^= key1be[i]; for (i = 0; i < 8; i++) input[i + 8] ^= key2be[i]; sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac); } #if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST) EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha); #endif
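The 0x36/0x5c XOR loops in mptcp_crypto_hmac_sha() above are the standard HMAC construction: the effective key is the 16 bytes key1be || key2be, zero-padded to the SHA-256 block size, so the routine computes plain HMAC-SHA256 over msg. As a sanity-check sketch (this assumes a userspace toolchain with OpenSSL and is not MPTCP code; build with: cc hmac_check.c -lcrypto), the same digest can be produced with a stock HMAC:

#include <stdio.h>
#include <stdint.h>
#include <openssl/hmac.h>
#include <openssl/evp.h>

/* Serialize a 64-bit value big-endian, like put_unaligned_be64(). */
static void put_be64(uint64_t v, unsigned char *p)
{
        for (int i = 0; i < 8; i++)
                p[i] = (unsigned char)(v >> (56 - 8 * i));
}

int main(void)
{
        uint64_t key1 = 0x0102030405060708ULL;      /* example values only */
        uint64_t key2 = 0x1112131415161718ULL;
        unsigned char key[16];
        unsigned char msg[8] = "nonces!";
        unsigned char mac[EVP_MAX_MD_SIZE];
        unsigned int maclen = 0;

        /* The HMAC key is the two 64-bit keys, big-endian, concatenated. */
        put_be64(key1, key);
        put_be64(key2, key + 8);

        HMAC(EVP_sha256(), key, sizeof(key), msg, sizeof(msg), mac, &maclen);

        for (unsigned int i = 0; i < maclen; i++)
                printf("%02x", mac[i]);
        printf("\n");
        return 0;
}

Feeding the same key pair and message through both paths should produce identical 32-byte digests, which makes it easy to generate test vectors for the kernel routine.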
// SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/mm.h> #include <asm/current.h> #include <asm/traps.h> #include <asm/vdso.h> struct vdso_exception_table_entry { int insn, fixup; }; bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct vdso_image *image = current->mm->context.vdso_image; const struct vdso_exception_table_entry *extable; unsigned int nr_entries, i; unsigned long base; /* * Do not attempt to fixup #DB or #BP. It's impossible to identify * whether or not a #DB/#BP originated from within an SGX enclave and * SGX enclaves are currently the only use case for vDSO fixup. */ if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP) return false; if (!current->mm->context.vdso) return false; base = (unsigned long)current->mm->context.vdso + image->extable_base; nr_entries = image->extable_len / (sizeof(*extable)); extable = image->extable; for (i = 0; i < nr_entries; i++) { if (regs->ip == base + extable[i].insn) { regs->ip = base + extable[i].fixup; regs->di = trapnr; regs->si = error_code; regs->dx = fault_addr; return true; } } return false; }
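fixup_vdso_exception() above resolves a faulting instruction pointer by scanning a table of {insn, fixup} offsets relative to the vDSO mapping and, on a match, redirects execution to the fixup address while passing the trap number, error code and fault address in registers. The toy lookup below shows the same offset-table idea with invented names (fixup_entry, lookup_fixup) and plain addresses instead of pt_regs; it is an illustration, not the x86 code.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* One table entry: offsets of the faulting insn and its fixup, from base. */
struct fixup_entry {
        int insn;
        int fixup;
};

/*
 * If 'ip' matches a known faulting instruction, return the address to
 * resume at; otherwise return 0. Mirrors the linear scan in
 * fixup_vdso_exception(), minus the pt_regs plumbing.
 */
static uintptr_t lookup_fixup(uintptr_t base, uintptr_t ip,
                              const struct fixup_entry *table, size_t n)
{
        for (size_t i = 0; i < n; i++)
                if (ip == base + (uintptr_t)table[i].insn)
                        return base + (uintptr_t)table[i].fixup;
        return 0;
}

int main(void)
{
        static const struct fixup_entry table[] = {
                { .insn = 0x10, .fixup = 0x40 },  /* e.g. an insn that may fault */
                { .insn = 0x24, .fixup = 0x40 },
        };
        uintptr_t base = 0x400000;

        printf("fixup for base+0x24: %#jx\n",
               (uintmax_t)lookup_fixup(base, base + 0x24, table, 2));
        printf("fixup for base+0x30: %#jx\n",
               (uintmax_t)lookup_fixup(base, base + 0x30, table, 2));
        return 0;
}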
// SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. * Written by Takashi Sato <t-sato@yk.jp.nec.com> * Akira Fujita <a-fujita@rs.jp.nec.com> */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4.h" #include "ext4_extents.h" /** * get_ext_path() - Find an extent path for designated logical block number. * @inode: inode to be searched * @lblock: logical block number to find an extent path * @path: pointer to an extent path * * ext4_find_extent wrapper. Return an extent path pointer on success, * or an error pointer on failure.
*/ static inline struct ext4_ext_path * get_ext_path(struct inode *inode, ext4_lblk_t lblock, struct ext4_ext_path *path) { path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return path; if (path[ext_depth(inode)].p_ext == NULL) { ext4_free_ext_path(path); return ERR_PTR(-ENODATA); } return path; } /** * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem * @first: inode to be locked * @second: inode to be locked * * Acquire write lock of i_data_sem of the two inodes */ void ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER); } else { down_write(&EXT4_I(second)->i_data_sem); down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); } } /** * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second * Release write lock of i_data_sem of two inodes (orig and donor). */ void ext4_double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); } /** * mext_check_coverage - Check that all extents in range has the same type * * @inode: inode in question * @from: block offset of inode * @count: block count to be checked * @unwritten: extents expected to be unwritten * @err: pointer to save error value * * Return 1 if all extents in range has expected type, and zero otherwise. */ static int mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, int unwritten, int *err) { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; int ret = 0; ext4_lblk_t last = from + count; while (from < last) { path = get_ext_path(inode, from, path); if (IS_ERR(path)) { *err = PTR_ERR(path); return ret; } ext = path[ext_depth(inode)].p_ext; if (unwritten != ext4_ext_is_unwritten(ext)) goto out; from += ext4_ext_get_actual_len(ext); } ret = 1; out: ext4_free_ext_path(path); return ret; } /** * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure * @index1: folio index * @index2: folio index * @folio: result folio vector * * Grab two locked folio for inode's by inode order */ static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2, pgoff_t index1, pgoff_t index2, struct folio *folio[2]) { struct address_space *mapping[2]; unsigned int flags; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { mapping[0] = inode1->i_mapping; mapping[1] = inode2->i_mapping; } else { swap(index1, index2); mapping[0] = inode2->i_mapping; mapping[1] = inode1->i_mapping; } flags = memalloc_nofs_save(); folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[0])); if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); return PTR_ERR(folio[0]); } folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); if (IS_ERR(folio[1])) { folio_unlock(folio[0]); folio_put(folio[0]); return PTR_ERR(folio[1]); } /* * __filemap_get_folio() may not wait on folio's writeback if * BDI not demand that. 
But it is reasonable to be very conservative * here and explicitly wait on folio's writeback */ folio_wait_writeback(folio[0]); folio_wait_writeback(folio[1]); if (inode1 > inode2) swap(folio[0], folio[1]); return 0; } /* Force folio buffers uptodate w/o dropping folio's lock */ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) { struct inode *inode = folio->mapping->host; sector_t block; struct buffer_head *bh, *head; unsigned int blocksize, block_start, block_end; int nr = 0; bool partial = false; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (folio_test_uptodate(folio)) return 0; blocksize = i_blocksize(inode); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); block = folio_pos(folio) >> inode->i_blkbits; block_end = 0; bh = head; do { block_start = block_end; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) partial = true; continue; } if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { int err = ext4_get_block(inode, block, bh, 0); if (err) return err; if (!buffer_mapped(bh)) { folio_zero_range(folio, block_start, blocksize); set_buffer_uptodate(bh); continue; } } lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); continue; } ext4_read_bh_nowait(bh, 0, NULL, false); nr++; } while (block++, (bh = bh->b_this_page) != head); /* No io required */ if (!nr) goto out; bh = head; do { if (bh_offset(bh) + blocksize <= from) continue; if (bh_offset(bh) > to) break; wait_on_buffer(bh); if (buffer_uptodate(bh)) continue; return -EIO; } while ((bh = bh->b_this_page) != head); out: if (!partial) folio_mark_uptodate(folio); return 0; } /** * move_extent_per_page - Move extent data per page * * @o_filp: file structure of original file * @donor_inode: donor inode * @orig_page_offset: page index on original file * @donor_page_offset: page index on donor file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents * with donor inode extents by calling ext4_swap_extents(). * Finally, write out the saved data in new original inode blocks. Return * replaced block count. */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, pgoff_t donor_page_offset, int data_offset_in_page, int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int tmp_data_size, data_size, replaced_size; int i, err2, jblocks, retries = 0; int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; /* * It needs twice the amount of ordinary journal buffers because * inode and donor_inode may change each different metadata blocks. 
*/ again: *err = 0; jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); if (IS_ERR(handle)) { *err = PTR_ERR(handle); return 0; } orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; donor_blk_offset = donor_page_offset * blocks_per_page + data_offset_in_page; /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { /* Replace the last block */ tmp_data_size = orig_inode->i_size & (blocksize - 1); /* * If data_size equal zero, it shows data_size is multiples of * blocksize. So we set appropriate value. */ if (tmp_data_size == 0) tmp_data_size = blocksize; data_size = tmp_data_size + ((block_len_in_page - 1) << orig_inode->i_blkbits); } else data_size = block_len_in_page << orig_inode->i_blkbits; replaced_size = data_size; *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, donor_page_offset, folio); if (unlikely(*err < 0)) goto stop_journal; /* * If orig extent was unwritten it can become initialized * at any time after i_data_sem was dropped, in order to * serialize with delalloc we have recheck extent while we * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ unwritten = mext_check_coverage(orig_inode, orig_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; if (!unwritten) { ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } if (!filemap_release_folio(folio[0], 0) || !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_folios; } data_copy: *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. 
*/ if (!filemap_release_folio(folio[0], 0) || !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 1, err); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (*err) { if (replaced_count) { block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; } else goto unlock_folios; } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ bh = folio_buffers(folio[0]); if (!bh) bh = create_empty_buffers(folio[0], 1 << orig_inode->i_blkbits, 0); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) goto repair_branches; bh = bh->b_this_page; } block_commit_write(&folio[0]->page, from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ *err = ext4_jbd2_inode_add_write(handle, orig_inode, (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); unlock_folios: folio_unlock(folio[0]); folio_put(folio[0]); folio_unlock(folio[1]); folio_put(folio[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto again; /* Buffer was busy because probably is pinned to journal transaction, * force transaction commit may help to free it. */ if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) goto again; return replaced_count; repair_branches: /* * This should never ever happen! * Extents are swapped already, but we are not able to copy data. * Try to swap extents to it's original places */ ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 0, &err2); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), EIO, "Unable to copy data block," " data will be lost."); *err = -EIO; } replaced_count = 0; goto unlock_folios; } /** * mext_check_arguments - Check whether move extent can be done * * @orig_inode: original inode * @donor_inode: donor inode * @orig_start: logical start offset in block for orig * @donor_start: logical start offset in block for donor * @len: the number of blocks to be moved * * Check the arguments of ext4_move_extents() whether the files can be * exchanged with each other. * Return 0 on success, or a negative error value on failure. 
*/ static int mext_check_arguments(struct inode *orig_inode, struct inode *donor_inode, __u64 orig_start, __u64 donor_start, __u64 *len) { __u64 orig_eof, donor_eof; unsigned int blkbits = orig_inode->i_blkbits; unsigned int blocksize = 1 << blkbits; orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" " to donor file [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) return -EPERM; /* Ext4 move extent does not support swap files */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -ETXTBSY; } if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EOPNOTSUPP; } /* Ext4 move extent supports only extent based file */ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " "based file [ino:orig %lu]\n", orig_inode->i_ino); return -EOPNOTSUPP; } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: donor file is not extents " "based file [ino:donor %lu]\n", donor_inode->i_ino); return -EOPNOTSUPP; } if ((!orig_inode->i_size) || (!donor_inode->i_size)) { ext4_debug("ext4 move extent: File size is 0 byte\n"); return -EINVAL; } /* Start offset should be same */ if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { ext4_debug("ext4 move extent: orig and donor's start " "offsets are not aligned [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if ((orig_start >= EXT_MAX_BLOCKS) || (donor_start >= EXT_MAX_BLOCKS) || (*len > EXT_MAX_BLOCKS) || (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (orig_eof <= orig_start) *len = 0; else if (orig_eof < orig_start + *len - 1) *len = orig_eof - orig_start; if (donor_eof <= donor_start) *len = 0; else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } return 0; } /** * ext4_move_extents - Exchange the specified range of a file * * @o_filp: file structure of the original file * @d_filp: file structure of the donor file * @orig_blk: start offset in block for orig * @donor_blk: start offset in block for donor * @len: the number of blocks to be moved * @moved_len: moved block length * * This function returns 0 and moved block length is set in moved_len * if succeed, otherwise returns error value. 
* */ int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, __u64 donor_blk, __u64 len, __u64 *moved_len) { struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); struct ext4_ext_path *path = NULL; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; ext4_lblk_t o_end, o_start = orig_blk; ext4_lblk_t d_start = donor_blk; int ret; if (orig_inode->i_sb != donor_inode->i_sb) { ext4_debug("ext4 move extent: The argument files " "should be in same FS [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* orig and donor should be different inodes */ if (orig_inode == donor_inode) { ext4_debug("ext4 move extent: The argument files should not " "be same inode [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* Regular file check */ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { ext4_debug("ext4 move extent: The argument files should be " "regular file [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* TODO: it's not obvious how to swap blocks for inodes with full journaling enabled */ if (ext4_should_journal_data(orig_inode) || ext4_should_journal_data(donor_inode)) { ext4_msg(orig_inode->i_sb, KERN_ERR, "Online defrag not supported with data journaling"); return -EOPNOTSUPP; } if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { ext4_msg(orig_inode->i_sb, KERN_ERR, "Online defrag not supported for encrypted files"); return -EOPNOTSUPP; } /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); /* Wait for all existing dio workers */ inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, donor_blk, &len); if (ret) goto out; o_end = o_start + len; *moved_len = 0; while (o_start < o_end) { struct ext4_extent *ex; ext4_lblk_t cur_blk, next_blk; pgoff_t orig_page_index, donor_page_index; int offset_in_page; int unwritten, cur_len; path = get_ext_path(orig_inode, o_start, path); if (IS_ERR(path)) { ret = PTR_ERR(path); goto out; } ex = path[path->p_depth].p_ext; cur_blk = le32_to_cpu(ex->ee_block); cur_len = ext4_ext_get_actual_len(ex); /* Check hole before the start pos */ if (cur_blk + cur_len - 1 < o_start) { next_blk = ext4_ext_next_allocated_block(path); if (next_blk == EXT_MAX_BLOCKS) { ret = -ENODATA; goto out; } d_start += next_blk - o_start; o_start = next_blk; continue; /* Check hole after the start pos */ } else if (cur_blk > o_start) { /* Skip hole */ d_start += cur_blk - o_start; o_start = cur_blk; /* Extent inside requested range ?*/ if (cur_blk >= o_end) goto out; } else { /* in_range(o_start, o_blk, o_len) */ cur_len += cur_blk - o_start; } unwritten = ext4_ext_is_unwritten(ex); if (o_end - o_start < cur_len) cur_len = o_end - o_start; orig_page_index = o_start >> (PAGE_SHIFT - orig_inode->i_blkbits); donor_page_index = d_start >> (PAGE_SHIFT - donor_inode->i_blkbits); offset_in_page = o_start % blocks_per_page; if (cur_len > blocks_per_page - offset_in_page) cur_len = blocks_per_page - offset_in_page; /* * Up semaphore to avoid following problems: * a. transaction deadlock among ext4_journal_start, * ->write_begin via pagefault, and jbd2_journal_commit * b. 
racing with ->read_folio, ->write_begin, and * ext4_get_block in move_extent_per_page */ ext4_double_up_write_data_sem(orig_inode, donor_inode); /* Swap original branches with new branches */ *moved_len += move_extent_per_page(o_filp, donor_inode, orig_page_index, donor_page_index, offset_in_page, cur_len, unwritten, &ret); ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; o_start += cur_len; d_start += cur_len; } out: if (*moved_len) { ext4_discard_preallocations(orig_inode); ext4_discard_preallocations(donor_inode); } ext4_free_ext_path(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); unlock_two_nondirectories(orig_inode, donor_inode); return ret; }
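ext4_move_extents() is reached from userspace through the EXT4_IOC_MOVE_EXT ioctl (the interface e4defrag is built on). Below is a minimal, hedged sketch of such a caller; the struct move_extent layout and ioctl number are copied from ext4's header as I understand them, and the file names are placeholders.

/*
 * Userspace sketch: ask ext4 to swap the first 1024 blocks of
 * "fragmented.dat" with the blocks backing "donor.dat".
 * The original file must be opened read-write, the donor for writing;
 * both must be regular, extent-based files on the same filesystem
 * (see mext_check_arguments() above).
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>

struct move_extent {
	uint32_t reserved;	/* must be zero */
	uint32_t donor_fd;	/* donor file descriptor */
	uint64_t orig_start;	/* logical start block in the original file */
	uint64_t donor_start;	/* logical start block in the donor file */
	uint64_t len;		/* number of blocks to move */
	uint64_t moved_len;	/* filled in by the kernel on return */
};
#define EXT4_IOC_MOVE_EXT	_IOWR('f', 15, struct move_extent)

int main(void)
{
	int orig = open("fragmented.dat", O_RDWR);
	int donor = open("donor.dat", O_WRONLY);
	struct move_extent me;

	if (orig < 0 || donor < 0) {
		perror("open");
		return 1;
	}

	me = (struct move_extent) {
		.donor_fd    = (uint32_t)donor,
		.orig_start  = 0,
		.donor_start = 0,
		.len         = 1024,	/* in filesystem blocks, not bytes */
	};

	if (ioctl(orig, EXT4_IOC_MOVE_EXT, &me) < 0) {
		perror("EXT4_IOC_MOVE_EXT");
		return 1;
	}

	printf("moved %llu blocks\n", (unsigned long long)me.moved_len);
	return 0;
}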
49 3069 178 4 5298 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMEKEEPING_H #define _LINUX_TIMEKEEPING_H #include <linux/errno.h> #include <linux/clocksource_ids.h> #include <linux/ktime.h> /* Included from linux/ktime.h */ void timekeeping_init(void); extern int timekeeping_suspended; /* Architecture timer tick functions: */ extern void legacy_timer_tick(unsigned long ticks); /* * Get and set timeofday */ extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* * ktime_get() family - read the current time in a multitude of ways. * * The default time reference is CLOCK_MONOTONIC, starting at * boot time but not counting the time spent in suspend. * For other references, use the functions with "real", "clocktai", * "boottime" and "raw" suffixes. * * To get the time in a different format, use the ones with * "ns", "ts64" and "seconds" suffix. * * See Documentation/core-api/timekeeping.rst for more details. 
*/ /* * timespec64 based interfaces */ extern void ktime_get_raw_ts64(struct timespec64 *ts); extern void ktime_get_ts64(struct timespec64 *ts); extern void ktime_get_real_ts64(struct timespec64 *tv); extern void ktime_get_coarse_ts64(struct timespec64 *ts); extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); /* Multigrain timestamp interfaces */ extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); extern void ktime_get_real_ts64_mg(struct timespec64 *ts); extern unsigned long timekeeping_get_mg_floor_swaps(void); void getboottime64(struct timespec64 *ts); /* * time64_t base interfaces */ extern time64_t ktime_get_seconds(void); extern time64_t __ktime_get_real_seconds(void); extern time64_t ktime_get_real_seconds(void); /* * ktime_t based interfaces */ enum tk_offsets { TK_OFFS_REAL, TK_OFFS_BOOT, TK_OFFS_TAI, TK_OFFS_MAX, }; extern ktime_t ktime_get(void); extern ktime_t ktime_get_with_offset(enum tk_offsets offs); extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs); extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format * * Returns: real (wall) time in ktime_t format */ static inline ktime_t ktime_get_real(void) { return ktime_get_with_offset(TK_OFFS_REAL); } static inline ktime_t ktime_get_coarse_real(void) { return ktime_get_coarse_with_offset(TK_OFFS_REAL); } /** * ktime_get_boottime - Get monotonic time since boot in ktime_t format * * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the * time spent in suspend. * * Returns: monotonic time since boot in ktime_t format */ static inline ktime_t ktime_get_boottime(void) { return ktime_get_with_offset(TK_OFFS_BOOT); } static inline ktime_t ktime_get_coarse_boottime(void) { return ktime_get_coarse_with_offset(TK_OFFS_BOOT); } /** * ktime_get_clocktai - Get the TAI time of day in ktime_t format * * Returns: the TAI time of day in ktime_t format */ static inline ktime_t ktime_get_clocktai(void) { return ktime_get_with_offset(TK_OFFS_TAI); } static inline ktime_t ktime_get_coarse_clocktai(void) { return ktime_get_coarse_with_offset(TK_OFFS_TAI); } static inline ktime_t ktime_get_coarse(void) { struct timespec64 ts; ktime_get_coarse_ts64(&ts); return timespec64_to_ktime(ts); } static inline u64 ktime_get_coarse_ns(void) { return ktime_to_ns(ktime_get_coarse()); } static inline u64 ktime_get_coarse_real_ns(void) { return ktime_to_ns(ktime_get_coarse_real()); } static inline u64 ktime_get_coarse_boottime_ns(void) { return ktime_to_ns(ktime_get_coarse_boottime()); } static inline u64 ktime_get_coarse_clocktai_ns(void) { return ktime_to_ns(ktime_get_coarse_clocktai()); } /** * ktime_mono_to_real - Convert monotonic time to clock realtime * @mono: monotonic time to convert * * Returns: time converted to realtime clock */ static inline ktime_t ktime_mono_to_real(ktime_t mono) { return ktime_mono_to_any(mono, TK_OFFS_REAL); } /** * ktime_get_ns - Get the current time in nanoseconds * * Returns: current time converted to nanoseconds */ static inline u64 ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } /** * ktime_get_real_ns - Get the current real/wall time in nanoseconds * * Returns: current real time converted to nanoseconds */ static inline u64 ktime_get_real_ns(void) { return ktime_to_ns(ktime_get_real()); } /** * ktime_get_boottime_ns - Get the monotonic time since boot in nanoseconds * * Returns: current boottime converted to 
nanoseconds */ static inline u64 ktime_get_boottime_ns(void) { return ktime_to_ns(ktime_get_boottime()); } /** * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds * * Returns: current TAI time converted to nanoseconds */ static inline u64 ktime_get_clocktai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); } /** * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds * * Returns: current raw monotonic time converted to nanoseconds */ static inline u64 ktime_get_raw_ns(void) { return ktime_to_ns(ktime_get_raw()); } extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); extern u64 ktime_get_tai_fast_ns(void); extern u64 ktime_get_real_fast_ns(void); /* * timespec64/time64_t interfaces utilizing the ktime based ones * for API completeness, these could be implemented more efficiently * if needed. */ static inline void ktime_get_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_boottime()); } static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_coarse_boottime()); } static inline time64_t ktime_get_boottime_seconds(void) { return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC); } static inline void ktime_get_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); } static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_coarse_clocktai()); } static inline time64_t ktime_get_clocktai_seconds(void) { return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC); } /* * RTC specific */ extern bool timekeeping_rtc_skipsuspend(void); extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); /** * struct ktime_timestamps - Simultaneous mono/boot/real timestamps * @mono: Monotonic timestamp * @boot: Boottime timestamp * @real: Realtime timestamp */ struct ktime_timestamps { u64 mono; u64 boot; u64 real; }; /** * struct system_time_snapshot - simultaneous raw/real time capture with * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @boot: Boot time * @raw: Monotonic raw system time * @cs_id: Clocksource ID * @clock_was_set_seq: The sequence number of clock-was-set events * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { u64 cycles; ktime_t real; ktime_t boot; ktime_t raw; enum clocksource_ids cs_id; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; }; /** * struct system_device_crosststamp - system/device cross-timestamp * (synchronized capture) * @device: Device time * @sys_realtime: Realtime simultaneous with device time * @sys_monoraw: Monotonic raw simultaneous with device time */ struct system_device_crosststamp { ktime_t device; ktime_t sys_realtime; ktime_t sys_monoraw; }; /** * struct system_counterval_t - system counter value with the ID of the * corresponding clocksource * @cycles: System counter value * @cs_id: Clocksource ID corresponding to system counter value. Used by * timekeeping code to verify comparability of two cycle values. * The default ID, CSID_GENERIC, does not identify a specific * clocksource. * @use_nsecs: @cycles is in nanoseconds. 
*/ struct system_counterval_t { u64 cycles; enum clocksource_ids cs_id; bool use_nsecs; }; extern bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles); extern bool timekeeping_clocksource_has_base(enum clocksource_ids id); /* * Get cross timestamp between system clock and device clock */ extern int get_device_system_crosststamp( int (*get_time_fn)(ktime_t *device_time, struct system_counterval_t *system_counterval, void *ctx), void *ctx, struct system_time_snapshot *history, struct system_device_crosststamp *xtstamp); /* * Simultaneously snapshot realtime and monotonic raw clocks */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); /* NMI safe mono/boot/realtime timestamps */ extern void ktime_get_fast_timestamps(struct ktime_timestamps *snap); /* * Persistent clock related interfaces */ extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock, struct timespec64 *boot_offset); #ifdef CONFIG_GENERIC_CMOS_UPDATE extern int update_persistent_clock64(struct timespec64 now); #endif #endif
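As a quick illustration of the ktime_get() family declared above, here is a minimal sketch that timestamps a unit of work on the monotonic clock and logs the elapsed time alongside the current wall-clock nanoseconds; the surrounding helper function is hypothetical, not part of this header.

/*
 * Hypothetical helper: measure how long work() takes using the
 * monotonic clock (suspend time excluded), then log it together with
 * the current realtime clock in nanoseconds.
 */
#include <linux/ktime.h>
#include <linux/printk.h>
#include <linux/timekeeping.h>

static void example_measure(void (*work)(void))
{
	ktime_t start = ktime_get();	/* CLOCK_MONOTONIC */
	u64 elapsed_ns;

	work();
	elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), start));

	pr_info("work took %llu ns, wall clock now %llu ns\n",
		elapsed_ns, ktime_get_real_ns());
}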
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) B.A.T.M.A.N. contributors:
 *
 * Marek Lindner
 */

#ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_
#define _NET_BATMAN_ADV_SOFT_INTERFACE_H_

#include "main.h"

#include <linux/kref.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/rtnetlink.h>

int batadv_skb_head_push(struct sk_buff *skb, unsigned int len);
void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb,
			 int hdr_size, struct batadv_orig_node *orig_node);
bool batadv_softif_is_valid(const struct net_device *net_dev);
extern struct rtnl_link_ops batadv_link_ops;
int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid);
void batadv_softif_vlan_release(struct kref *ref);
struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
						  unsigned short vid);

/**
 * batadv_softif_vlan_put() - decrease the vlan object refcounter and
 *  possibly release it
 * @vlan: the vlan object to release
 */
static inline void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan)
{
	if (!vlan)
		return;

	kref_put(&vlan->refcount, batadv_softif_vlan_release);
}

#endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */
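The vlan lookup/put pair above follows the usual kref pattern: batadv_softif_vlan_get() returns the object with a reference held (or NULL), and batadv_softif_vlan_put() drops it. A hedged sketch of a balanced caller follows; the function itself is illustrative and not part of batman-adv.

/*
 * Illustrative caller: look up the per-VLAN object, use it under the
 * reference taken by batadv_softif_vlan_get(), then release it.
 * batadv_softif_vlan_put() tolerates a NULL argument, but here we
 * bail out early instead.
 */
static int example_touch_vlan(struct batadv_priv *bat_priv, unsigned short vid)
{
	struct batadv_softif_vlan *vlan = batadv_softif_vlan_get(bat_priv, vid);

	if (!vlan)
		return -ENOENT;	/* no such VLAN on this soft interface */

	/* ... read or update fields of *vlan while the reference is held ... */

	batadv_softif_vlan_put(vlan);
	return 0;
}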
2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_key_cache.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_gc.h" #include "btree_write_buffer.h" #include "buckets.h" #include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" #include "disk_accounting.h" #include "ec.h" #include "error.h" #include "lru.h" #include "recovery.h" #include "trace.h" #include "varint.h" #include <linux/kthread.h> #include <linux/math64.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/sched/task.h> #include <linux/sort.h> #include <linux/jiffies.h> static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); /* Persistent alloc info: */ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, BCH_ALLOC_FIELDS_V1() #undef x }; struct bkey_alloc_unpacked { u64 journal_seq; u8 gen; u8 oldest_gen; u8 data_type; bool need_discard:1; bool need_inc_gen:1; #define x(_name, _bits) u##_bits _name; BCH_ALLOC_FIELDS_V2() #undef x }; static inline u64 alloc_field_v1_get(const struct bch_alloc *a, const void **p, unsigned field) { unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; u64 v; if (!(a->fields & (1 << field))) return 0; switch (bytes) { case 1: v = *((const u8 *) *p); break; case 2: v = le16_to_cpup(*p); break; case 4: v = le32_to_cpup(*p); break; case 8: v = le64_to_cpup(*p); break; default: BUG(); } *p += bytes; return v; } static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, struct bkey_s_c k) { const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; const void *d = in->data; unsigned idx = 0; out->gen = in->gen; #define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); BCH_ALLOC_FIELDS_V1() #undef x } static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, struct bkey_s_c k) { struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); const u8 *in = a.v->data; const u8 *end = bkey_val_end(a); unsigned fieldnr = 0; int ret; u64 v; out->gen = a.v->gen; out->oldest_gen = a.v->oldest_gen; out->data_type = a.v->data_type; #define x(_name, _bits) \ if (fieldnr < a.v->nr_fields) { \ ret = bch2_varint_decode_fast(in, end, &v); \ if (ret < 0) \ return ret; \ in += ret; \ } else { \ v = 0; \ } 
\ out->_name = v; \ if (v != out->_name) \ return -1; \ fieldnr++; BCH_ALLOC_FIELDS_V2() #undef x return 0; } static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, struct bkey_s_c k) { struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); const u8 *in = a.v->data; const u8 *end = bkey_val_end(a); unsigned fieldnr = 0; int ret; u64 v; out->gen = a.v->gen; out->oldest_gen = a.v->oldest_gen; out->data_type = a.v->data_type; out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); out->journal_seq = le64_to_cpu(a.v->journal_seq); #define x(_name, _bits) \ if (fieldnr < a.v->nr_fields) { \ ret = bch2_varint_decode_fast(in, end, &v); \ if (ret < 0) \ return ret; \ in += ret; \ } else { \ v = 0; \ } \ out->_name = v; \ if (v != out->_name) \ return -1; \ fieldnr++; BCH_ALLOC_FIELDS_V2() #undef x return 0; } static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) { struct bkey_alloc_unpacked ret = { .gen = 0 }; switch (k.k->type) { case KEY_TYPE_alloc: bch2_alloc_unpack_v1(&ret, k); break; case KEY_TYPE_alloc_v2: bch2_alloc_unpack_v2(&ret, k); break; case KEY_TYPE_alloc_v3: bch2_alloc_unpack_v3(&ret, k); break; } return ret; } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) if (a->fields & (1 << i)) bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; return DIV_ROUND_UP(bytes, sizeof(u64)); } int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); int ret = 0; /* allow for unknown fields */ bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, alloc_v1_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); fsck_err: return ret; } int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { struct bkey_alloc_unpacked u; int ret = 0; bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { struct bkey_alloc_unpacked u; int ret = 0; bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { struct bch_alloc_v4 a; int ret = 0; bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k)); bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k), c, alloc_v4_val_size_bad, "bad val size (%u > %zu)", alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k)); bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) && BCH_ALLOC_V4_NR_BACKPOINTERS(&a), c, alloc_v4_backpointers_start_bad, "invalid backpointers_start"); bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type, c, alloc_key_data_type_bad, "invalid data type (got %u should be %u)", a.data_type, alloc_data_type(a, a.data_type)); for (unsigned i = 0; i < 2; i++) bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX, c, alloc_key_io_time_bad, "invalid io_time[%s]: %llu, max %llu", i == READ ? "read" : "write", a.io_time[i], LRU_TIME_MAX); unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) > offsetof(struct bch_alloc_v4, stripe_sectors) ? 
a.stripe_sectors : 0; switch (a.data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: bkey_fsck_err_on(stripe_sectors || a.dirty_sectors || a.cached_sectors || a.stripe, c, alloc_key_empty_but_have_data, "empty data type free but have data %u.%u.%u %u", stripe_sectors, a.dirty_sectors, a.cached_sectors, a.stripe); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: bkey_fsck_err_on(!a.dirty_sectors && !stripe_sectors, c, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", bch2_data_type_str(a.data_type)); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.cached_sectors || a.dirty_sectors || stripe_sectors || a.stripe, c, alloc_key_cached_inconsistency, "data type inconsistency"); bkey_fsck_err_on(!a.io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: break; } fsck_err: return ret; } void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; struct bch_backpointer *bp, *bps; a->journal_seq = swab64(a->journal_seq); a->flags = swab32(a->flags); a->dirty_sectors = swab32(a->dirty_sectors); a->cached_sectors = swab32(a->cached_sectors); a->io_time[0] = swab64(a->io_time[0]); a->io_time[1] = swab64(a->io_time[1]); a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); a->stripe_sectors = swab32(a->stripe_sectors); bps = alloc_v4_backpointers(a); for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { bp->bucket_offset = swab40(bp->bucket_offset); bp->bucket_len = swab32(bp->bucket_len); bch2_bpos_swab(&bp->pos); } } void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); struct bch_dev *ca = c ? 
bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL; prt_newline(out); printbuf_indent_add(out, 2); prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); prt_newline(out); prt_printf(out, "journal_seq %llu\n", a->journal_seq); prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); prt_printf(out, "cached_sectors %u\n", a->cached_sectors); prt_printf(out, "stripe %u\n", a->stripe); prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); if (ca) prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a)); printbuf_indent_sub(out, 2); bch2_dev_put(ca); } void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) { if (k.k->type == KEY_TYPE_alloc_v4) { void *src, *dst; *out = *bkey_s_c_to_alloc_v4(k).v; src = alloc_v4_backpointers(out); SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); dst = alloc_v4_backpointers(out); if (src < dst) memset(src, 0, dst - src); SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); } else { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); *out = (struct bch_alloc_v4) { .journal_seq = u.journal_seq, .flags = u.need_discard, .gen = u.gen, .oldest_gen = u.oldest_gen, .data_type = u.data_type, .stripe_redundancy = u.stripe_redundancy, .dirty_sectors = u.dirty_sectors, .cached_sectors = u.cached_sectors, .io_time[READ] = u.read_time, .io_time[WRITE] = u.write_time, .stripe = u.stripe, }; SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); } } static noinline struct bkey_i_alloc_v4 * __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i_alloc_v4 *ret; ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); if (IS_ERR(ret)) return ret; if (k.k->type == KEY_TYPE_alloc_v4) { void *src, *dst; bkey_reassemble(&ret->k_i, k); src = alloc_v4_backpointers(&ret->v); SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); dst = alloc_v4_backpointers(&ret->v); if (src < dst) memset(src, 0, dst - src); SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); set_alloc_v4_u64s(ret); } else { bkey_alloc_v4_init(&ret->k_i); ret->k.p = k.k->p; bch2_alloc_to_v4(k, &ret->v); } return ret; } static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_s_c_alloc_v4 a; if (likely(k.k->type == KEY_TYPE_alloc_v4) && ((a = bkey_s_c_to_alloc_v4(k), true) && BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); return __bch2_alloc_to_v4_mut(trans, k); } struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { return bch2_alloc_to_v4_mut_inlined(trans, k); } struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos) { struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, BTREE_ITER_with_updates| BTREE_ITER_cached| BTREE_ITER_intent); int ret = bkey_err(k); if (unlikely(ret)) return ERR_PTR(ret); struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); ret = PTR_ERR_OR_ZERO(a); if (unlikely(ret)) 
goto err; return a; err: bch2_trans_iter_exit(trans, iter); return ERR_PTR(ret); } __flatten struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos, enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); int ret = PTR_ERR_OR_ZERO(a); if (ret) return ERR_PTR(ret); ret = bch2_trans_update(trans, &iter, &a->k_i, flags); bch2_trans_iter_exit(trans, &iter); return unlikely(ret) ? ERR_PTR(ret) : a; } static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) { *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; return pos; } static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) { pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; pos.offset += offset; return pos; } static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) { return k.k->type == KEY_TYPE_bucket_gens ? bkey_s_c_to_bucket_gens(k).v->gens[offset] : 0; } int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { int ret = 0; bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, bucket_gens_val_size_bad, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); fsck_err: return ret; } void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); unsigned i; for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { if (i) prt_char(out, ' '); prt_printf(out, "%u", g.v->gens[i]); } } int bch2_bucket_gens_init(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct bkey_i_bucket_gens g; bool have_bucket_gens_key = false; int ret; ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ if (!bch2_dev_bucket_exists(c, k.k->p)) continue; struct bch_alloc_v4 a; u8 gen = bch2_alloc_to_v4(k, &a)->gen; unsigned offset; struct bpos pos = alloc_gens_pos(iter.pos, &offset); int ret2 = 0; if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) { ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret2) goto iter_err; have_bucket_gens_key = false; } if (!have_bucket_gens_key) { bkey_bucket_gens_init(&g.k_i); g.k.p = pos; have_bucket_gens_key = true; } g.v.gens[offset] = gen; iter_err: ret2; })); if (have_bucket_gens_key && !ret) ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } int bch2_alloc_read(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct bch_dev *ca = NULL; int ret; if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_prefetch, k, ({ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; if (k.k->type != KEY_TYPE_bucket_gens) continue; ca = bch2_dev_iterate(c, ca, k.k->p.inode); /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ if (!ca) { bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; } const struct bch_bucket_gens *g = 
bkey_s_c_to_bucket_gens(k).v; for (u64 b = max_t(u64, ca->mi.first_bucket, start); b < min_t(u64, ca->mi.nbuckets, end); b++) *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; 0; })); } else { ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, ({ ca = bch2_dev_iterate(c, ca, k.k->p.inode); /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ if (!ca) { bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; } if (k.k->p.offset < ca->mi.first_bucket) { bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); continue; } if (k.k->p.offset >= ca->mi.nbuckets) { bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; } struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; 0; })); } bch2_dev_put(ca); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } /* Free space/discard btree: */ static int bch2_bucket_do_index(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c alloc_k, const struct bch_alloc_v4 *a, bool set) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c old; struct bkey_i *k; enum btree_id btree; enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; struct printbuf buf = PRINTBUF; int ret; if (a->data_type != BCH_DATA_free && a->data_type != BCH_DATA_need_discard) return 0; k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); if (IS_ERR(k)) return PTR_ERR(k); bkey_init(&k->k); k->k.type = new_type; switch (a->data_type) { case BCH_DATA_free: btree = BTREE_ID_freespace; k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); bch2_key_resize(&k->k, 1); break; case BCH_DATA_need_discard: btree = BTREE_ID_need_discard; k->k.p = alloc_k.k->p; break; default: return 0; } old = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(&k->k), BTREE_ITER_intent); ret = bkey_err(old); if (ret) return ret; if (ca->mi.freespace_initialized && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && bch2_trans_inconsistent_on(old.k->type != old_type, trans, "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" " for %s", set ? 
"setting" : "clearing", bch2_btree_id_str(btree), iter.pos.inode, iter.pos.offset, bch2_bkey_types[old.k->type], bch2_bkey_types[old_type], (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ret = -EIO; goto err; } ret = bch2_trans_update(trans, &iter, k, 0); err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; } static noinline int bch2_bucket_gen_update(struct btree_trans *trans, struct bpos bucket, u8 gen) { struct btree_iter iter; unsigned offset; struct bpos pos = alloc_gens_pos(bucket, &offset); struct bkey_i_bucket_gens *g; struct bkey_s_c k; int ret; g = bch2_trans_kmalloc(trans, sizeof(*g)); ret = PTR_ERR_OR_ZERO(g); if (ret) return ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, BTREE_ITER_intent| BTREE_ITER_with_updates); ret = bkey_err(k); if (ret) return ret; if (k.k->type != KEY_TYPE_bucket_gens) { bkey_bucket_gens_init(&g->k_i); g->k.p = iter.pos; } else { bkey_reassemble(&g->k_i, k); } g->v.gens[offset] = gen; ret = bch2_trans_update(trans, &iter, &g->k_i, 0); bch2_trans_iter_exit(trans, &iter); return ret; } static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca, enum bch_data_type data_type, s64 delta_buckets, s64 delta_sectors, s64 delta_fragmented, unsigned flags) { struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_dev_data_type, .dev_data_type.dev = ca->dev_idx, .dev_data_type.data_type = data_type, }; s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); } int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, const struct bch_alloc_v4 *old, const struct bch_alloc_v4 *new, unsigned flags) { s64 old_sectors = bch2_bucket_sectors(*old); s64 new_sectors = bch2_bucket_sectors(*new); if (old->data_type != new->data_type) { int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?: bch2_dev_data_type_accounting_mod(trans, ca, old->data_type, -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags); if (ret) return ret; } else if (old_sectors != new_sectors) { int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, 0, new_sectors - old_sectors, bch2_bucket_sectors_fragmented(ca, *new) - bch2_bucket_sectors_fragmented(ca, *old), flags); if (ret) return ret; } s64 old_unstriped = bch2_bucket_sectors_unstriped(*old); s64 new_unstriped = bch2_bucket_sectors_unstriped(*new); if (old_unstriped != new_unstriped) { int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped, !!new_unstriped - !!old_unstriped, new_unstriped - old_unstriped, 0, flags); if (ret) return ret; } return 0; } int bch2_trigger_alloc(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); if (!ca) return -EIO; struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); struct bch_alloc_v4 *new_a; if (likely(new.k->type == KEY_TYPE_alloc_v4)) { new_a = bkey_s_to_alloc_v4(new).v; } else { BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair))); struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); ret = PTR_ERR_OR_ZERO(new_ka); if (unlikely(ret)) goto err; new_a = &new_ka->v; } 
if (flags & BTREE_TRIGGER_transactional) { alloc_data_type_set(new_a, new_a->data_type); if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { new_a->io_time[READ] = bch2_current_io_time(c, READ); new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } if (data_type_is_empty(new_a->data_type) && BCH_ALLOC_V4_NEED_INC_GEN(new_a) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); alloc_data_type_set(new_a, new_a->data_type); } if (old_a->data_type != new_a->data_type || (new_a->data_type == BCH_DATA_free && alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); if (ret) goto err; } if (new_a->data_type == BCH_DATA_cached && !new_a->io_time[READ]) new_a->io_time[READ] = bch2_current_io_time(c, READ); u64 old_lru = alloc_lru_idx_read(*old_a); u64 new_lru = alloc_lru_idx_read(*new_a); if (old_lru != new_lru) { ret = bch2_lru_change(trans, new.k->p.inode, bucket_to_u64(new.k->p), old_lru, new_lru); if (ret) goto err; } old_lru = alloc_lru_idx_fragmentation(*old_a, ca); new_lru = alloc_lru_idx_fragmentation(*new_a, ca); if (old_lru != new_lru) { ret = bch2_lru_change(trans, BCH_LRU_FRAGMENTATION_START, bucket_to_u64(new.k->p), old_lru, new_lru); if (ret) goto err; } if (old_a->gen != new_a->gen) { ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); if (ret) goto err; } if ((flags & BTREE_TRIGGER_bucket_invalidate) && old_a->cached_sectors) { ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, -((s64) old_a->cached_sectors), flags & BTREE_TRIGGER_gc); if (ret) goto err; } ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); if (ret) goto err; } if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { u64 journal_seq = trans->journal_res.seq; u64 bucket_journal_seq = new_a->journal_seq; if ((flags & BTREE_TRIGGER_insert) && data_type_is_empty(old_a->data_type) != data_type_is_empty(new_a->data_type) && new.k->type == KEY_TYPE_alloc_v4) { struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; /* * If the btree updates referring to a bucket weren't flushed * before the bucket became empty again, then the we don't have * to wait on a journal flush before we can reuse the bucket: */ v->journal_seq = bucket_journal_seq = data_type_is_empty(new_a->data_type) && (journal_seq == v->journal_seq || bch2_journal_noflush_seq(&c->journal, v->journal_seq)) ? 
0 : journal_seq; } if (!data_type_is_empty(old_a->data_type) && data_type_is_empty(new_a->data_type) && bucket_journal_seq) { ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, new.k->p.inode, new.k->p.offset, bucket_journal_seq); if (bch2_fs_fatal_err_on(ret, c, "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) goto err; } if (new_a->gen != old_a->gen) { rcu_read_lock(); u8 *gen = bucket_gen(ca, new.k->p.offset); if (unlikely(!gen)) { rcu_read_unlock(); goto invalid_bucket; } *gen = new_a->gen; rcu_read_unlock(); } #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) #define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) if (statechange(a->data_type == BCH_DATA_free) && bucket_flushed(new_a)) closure_wake_up(&c->freelist_wait); if (statechange(a->data_type == BCH_DATA_need_discard) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) bch2_discard_one_bucket_fast(ca, new.k->p.offset); if (statechange(a->data_type == BCH_DATA_cached) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) bch2_dev_do_invalidates(ca); if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_gc_gens_async(c); } if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { rcu_read_lock(); struct bucket *g = gc_bucket(ca, new.k->p.offset); if (unlikely(!g)) { rcu_read_unlock(); goto invalid_bucket; } g->gen_valid = 1; g->gen = new_a->gen; rcu_read_unlock(); } err: printbuf_exit(&buf); bch2_dev_put(ca); return ret; invalid_bucket: bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); ret = -EIO; goto err; } /* * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for * extents style btrees, but works on non-extents btrees: */ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) return k; if (k.k->type) { return k; } else { struct btree_iter iter2; struct bpos next; bch2_trans_copy_iter(&iter2, iter); struct btree_path *path = btree_iter_path(iter->trans, iter); if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); /* * btree node min/max is a closed interval, upto takes a half * open interval: */ k = bch2_btree_iter_peek_upto(&iter2, end); next = iter2.pos; bch2_trans_iter_exit(iter->trans, &iter2); BUG_ON(next.offset >= iter->pos.offset + U32_MAX); if (bkey_err(k)) return k; bkey_init(hole); hole->p = iter->pos; bch2_key_resize(hole, next.offset - iter->pos.offset); return (struct bkey_s_c) { hole, NULL }; } } static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket) { if (*ca) { if (bucket->offset < (*ca)->mi.first_bucket) bucket->offset = (*ca)->mi.first_bucket; if (bucket->offset < (*ca)->mi.nbuckets) return true; bch2_dev_put(*ca); *ca = NULL; bucket->inode++; bucket->offset = 0; } rcu_read_lock(); *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); if (*ca) { *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); bch2_dev_get(*ca); } rcu_read_unlock(); return *ca != NULL; } static struct bkey_s_c 
bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bch_dev **ca, struct bkey *hole) { struct bch_fs *c = iter->trans->c; struct bkey_s_c k; again: k = bch2_get_key_or_hole(iter, POS_MAX, hole); if (bkey_err(k)) return k; *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode); if (!k.k->type) { struct bpos hole_start = bkey_start_pos(k.k); if (!*ca || !bucket_valid(*ca, hole_start.offset)) { if (!next_bucket(c, ca, &hole_start)) return bkey_s_c_null; bch2_btree_iter_set_pos(iter, hole_start); goto again; } if (k.k->p.offset > (*ca)->mi.nbuckets) bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset); } return k; } static noinline_for_stack int bch2_check_alloc_key(struct btree_trans *trans, struct bkey_s_c alloc_k, struct btree_iter *alloc_iter, struct btree_iter *discard_iter, struct btree_iter *freespace_iter, struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; unsigned discard_key_type, freespace_key_type; unsigned gens_offset; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret = 0; struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); if (fsck_err_on(!ca, trans, alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) ret = bch2_btree_delete_at(trans, alloc_iter, 0); if (!ca) return ret; if (!ca->mi.freespace_initialized) goto out; a = bch2_alloc_to_v4(alloc_k, &a_convert); discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); if (ret) goto err; if (fsck_err_on(k.k->type != discard_key_type, trans, need_discard_key_wrong, "incorrect key in need_discard btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], bch2_bkey_types[discard_key_type], (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); ret = PTR_ERR_OR_ZERO(update); if (ret) goto err; bkey_init(&update->k); update->k.type = discard_key_type; update->k.p = discard_iter->pos; ret = bch2_trans_update(trans, discard_iter, update, 0); if (ret) goto err; } freespace_key_type = a->data_type == BCH_DATA_free ? 
KEY_TYPE_set : 0; bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) goto err; if (fsck_err_on(k.k->type != freespace_key_type, trans, freespace_key_wrong, "incorrect key in freespace btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], bch2_bkey_types[freespace_key_type], (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); ret = PTR_ERR_OR_ZERO(update); if (ret) goto err; bkey_init(&update->k); update->k.type = freespace_key_type; update->k.p = freespace_iter->pos; bch2_key_resize(&update->k, 1); ret = bch2_trans_update(trans, freespace_iter, update, 0); if (ret) goto err; } bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); k = bch2_btree_iter_peek_slot(bucket_gens_iter); ret = bkey_err(k); if (ret) goto err; if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), trans, bucket_gens_key_wrong, "incorrect gen in bucket_gens btree (got %u should be %u)\n" " %s", alloc_gen(k, gens_offset), a->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i_bucket_gens *g = bch2_trans_kmalloc(trans, sizeof(*g)); ret = PTR_ERR_OR_ZERO(g); if (ret) goto err; if (k.k->type == KEY_TYPE_bucket_gens) { bkey_reassemble(&g->k_i, k); } else { bkey_bucket_gens_init(&g->k_i); g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); } g->v.gens[gens_offset] = a->gen; ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); if (ret) goto err; } out: err: fsck_err: bch2_dev_put(ca); printbuf_exit(&buf); return ret; } static noinline_for_stack int bch2_check_alloc_hole_freespace(struct btree_trans *trans, struct bch_dev *ca, struct bpos start, struct bpos *end, struct btree_iter *freespace_iter) { struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; if (!ca->mi.freespace_initialized) return 0; bch2_btree_iter_set_pos(freespace_iter, start); k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) goto err; *end = bkey_min(k.k->p, *end); if (fsck_err_on(k.k->type != KEY_TYPE_set, trans, freespace_hole_missing, "hole in alloc btree missing in freespace btree\n" " device %llu buckets %llu-%llu", freespace_iter->pos.inode, freespace_iter->pos.offset, end->offset)) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); ret = PTR_ERR_OR_ZERO(update); if (ret) goto err; bkey_init(&update->k); update->k.type = KEY_TYPE_set; update->k.p = freespace_iter->pos; bch2_key_resize(&update->k, min_t(u64, U32_MAX, end->offset - freespace_iter->pos.offset)); ret = bch2_trans_update(trans, freespace_iter, update, 0); if (ret) goto err; } err: fsck_err: printbuf_exit(&buf); return ret; } static noinline_for_stack int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, struct bpos start, struct bpos *end, struct btree_iter *bucket_gens_iter) { struct bkey_s_c k; struct printbuf buf = PRINTBUF; unsigned i, gens_offset, gens_end_offset; int ret; bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); k = bch2_btree_iter_peek_slot(bucket_gens_iter); ret = bkey_err(k); if (ret) goto err; if (bkey_cmp(alloc_gens_pos(start, &gens_offset), alloc_gens_pos(*end, &gens_end_offset))) gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; if (k.k->type == KEY_TYPE_bucket_gens) { struct bkey_i_bucket_gens g; bool need_update = false; bkey_reassemble(&g.k_i, k); for (i = gens_offset; i < gens_end_offset; i++) { 
if (fsck_err_on(g.v.gens[i], trans, bucket_gens_hole_wrong, "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", bucket_gens_pos_to_alloc(k.k->p, i).inode, bucket_gens_pos_to_alloc(k.k->p, i).offset, g.v.gens[i])) { g.v.gens[i] = 0; need_update = true; } } if (need_update) { struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; memcpy(u, &g, sizeof(g)); ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); if (ret) goto err; } } *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); err: fsck_err: printbuf_exit(&buf); return ret; } static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter; struct bkey_s_c alloc_k; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; u64 genbits; struct bpos pos; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard ? BCH_DATA_need_discard : BCH_DATA_free; struct printbuf buf = PRINTBUF; int ret; pos = iter->pos; pos.offset &= ~(~0ULL << 56); genbits = iter->pos.offset & (~0ULL << 56); alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); ret = bkey_err(alloc_k); if (ret) return ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), trans, need_discard_freespace_key_to_invalid_dev_bucket, "entry in %s btree for nonexistant dev:bucket %llu:%llu", bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) goto delete; a = bch2_alloc_to_v4(alloc_k, &a_convert); if (fsck_err_on(a->data_type != state || (state == BCH_DATA_free && genbits != alloc_freespace_genbits(*a)), trans, need_discard_freespace_key_bad, "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_id_str(iter->btree_id), iter->pos.inode, iter->pos.offset, a->data_type == state, genbits >> 56, alloc_freespace_genbits(*a) >> 56)) goto delete; out: fsck_err: bch2_set_btree_iter_dontneed(&alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; delete: ret = bch2_btree_delete_extent_at(trans, iter, iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); goto out; } /* * We've already checked that generation numbers in the bucket_gens btree are * valid for buckets that exist; this just checks for keys for nonexistent * buckets. 
*/ static noinline_for_stack int bch2_check_bucket_gens_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_i_bucket_gens g; u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; u64 b; bool need_update = false; struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(k.k->type != KEY_TYPE_bucket_gens); bkey_reassemble(&g.k_i, k); struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); if (!ca) { if (fsck_err(trans, bucket_gens_to_invalid_dev, "bucket_gens key for invalid device:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, 0); goto out; } if (fsck_err_on(end <= ca->mi.first_bucket || start >= ca->mi.nbuckets, trans, bucket_gens_to_invalid_buckets, "bucket_gens key for invalid buckets:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); goto out; } for (b = start; b < ca->mi.first_bucket; b++) if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], trans, bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; } for (b = ca->mi.nbuckets; b < end; b++) if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], trans, bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; } if (need_update) { struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); ret = PTR_ERR_OR_ZERO(u); if (ret) goto out; memcpy(u, &g, sizeof(g)); ret = bch2_trans_update(trans, iter, u, 0); } out: fsck_err: bch2_dev_put(ca); printbuf_exit(&buf); return ret; } int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; struct bch_dev *ca = NULL; struct bkey hole; struct bkey_s_c k; int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_prefetch); while (1) { struct bpos next; bch2_trans_begin(trans); k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); ret = bkey_err(k); if (ret) goto bkey_err; if (!k.k) break; if (k.k->type) { next = bpos_nosnap_successor(k.k->p); ret = bch2_check_alloc_key(trans, k, &iter, &discard_iter, &freespace_iter, &bucket_gens_iter); if (ret) goto bkey_err; } else { next = k.k->p; ret = bch2_check_alloc_hole_freespace(trans, ca, bkey_start_pos(k.k), &next, &freespace_iter) ?: bch2_check_alloc_hole_bucket_gens(trans, bkey_start_pos(k.k), &next, &bucket_gens_iter); if (ret) goto bkey_err; } ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; bch2_btree_iter_set_pos(&iter, next); bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; } bch2_trans_iter_exit(trans, &bucket_gens_iter); bch2_trans_iter_exit(trans, &freespace_iter); bch2_trans_iter_exit(trans, &discard_iter); bch2_trans_iter_exit(trans, &iter); bch2_dev_put(ca); ca = NULL; if (ret < 0) goto err; ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_prefetch, 
k, bch2_check_discard_freespace_key(trans, &iter)); if (ret) goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_prefetch); while (1) { bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); if (!k.k) break; ret = bkey_err(k) ?: bch2_check_discard_freespace_key(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } if (ret) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); bch_err(c, "while checking %s", buf.buf); printbuf_exit(&buf); break; } bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); } bch2_trans_iter_exit(trans, &iter); if (ret) goto err; ret = for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_bucket_gens_key(trans, &iter, k)); err: bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, struct btree_iter *alloc_iter, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret; alloc_k = bch2_btree_iter_peek(alloc_iter); if (!alloc_k.k) return 0; ret = bkey_err(alloc_k); if (ret) return ret; struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode); if (!ca) return 0; a = bch2_alloc_to_v4(alloc_k, &a_convert); u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); if (lru_idx) { ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, lru_idx, alloc_k, last_flushed); if (ret) goto err; } if (a->data_type != BCH_DATA_cached) goto err; if (fsck_err_on(!a->io_time[READ], trans, alloc_key_cached_but_read_time_zero, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i_alloc_v4 *a_mut = bch2_alloc_to_v4_mut(trans, alloc_k); ret = PTR_ERR_OR_ZERO(a_mut); if (ret) goto err; a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); ret = bch2_trans_update(trans, alloc_iter, &a_mut->k_i, BTREE_TRIGGER_norun); if (ret) goto err; a = &a_mut->v; } ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], alloc_k, last_flushed); if (ret) goto err; err: fsck_err: bch2_dev_put(ca); printbuf_exit(&buf); return ret; } int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { struct bkey_buf last_flushed; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) { int ret; mutex_lock(&ca->discard_buckets_in_flight_lock); darray_for_each(ca->discard_buckets_in_flight, i) if (i->bucket == bucket) { ret = -BCH_ERR_EEXIST_discard_in_flight_add; goto out; } ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { .in_progress = in_progress, .bucket = bucket, })); out: mutex_unlock(&ca->discard_buckets_in_flight_lock); return ret; } static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { mutex_lock(&ca->discard_buckets_in_flight_lock); darray_for_each(ca->discard_buckets_in_flight, i) if (i->bucket == bucket) { BUG_ON(!i->in_progress); darray_remove_item(&ca->discard_buckets_in_flight, i); goto found; } BUG(); 
found: mutex_unlock(&ca->discard_buckets_in_flight_lock); } struct discard_buckets_state { u64 seen; u64 open; u64 need_journal_commit; u64 discarded; u64 need_journal_commit_this_dev; }; static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, struct discard_buckets_state *s) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; struct btree_iter iter = { NULL }; struct bkey_s_c k; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; bool discard_locked = false; int ret = 0; if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; goto out; } if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, pos.inode, pos.offset)) { s->need_journal_commit++; s->need_journal_commit_this_dev++; goto out; } k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, need_discard_iter->pos, BTREE_ITER_cached); ret = bkey_err(k); if (ret) goto out; a = bch2_alloc_to_v4_mut(trans, k); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; if (bch2_bucket_sectors_total(a->v)) { if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "attempting to discard bucket with dirty data\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = -EIO; goto out; } if (a->v.data_type != BCH_DATA_need_discard) { if (data_type_is_empty(a->v.data_type) && BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { a->v.gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); goto write; } if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "bucket incorrectly set in need_discard btree\n" "%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = -EIO; goto out; } if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", a->v.journal_seq, c->journal.flushed_seq_ondisk, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = -EIO; goto out; } if (discard_in_flight_add(ca, iter.pos.offset, true)) goto out; discard_locked = true; if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges) { /* * This works without any other locks because this is the only * thread that removes items from the need_discard tree */ bch2_trans_unlock_long(trans); blkdev_issue_discard(ca->disk_sb.bdev, k.k->p.offset * ca->mi.bucket_size, ca->mi.bucket_size, GFP_KERNEL); *discard_pos_done = iter.pos; ret = bch2_trans_relock_notrace(trans); if (ret) goto out; } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); write: alloc_data_type_set(&a->v, a->v.data_type); ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; count_event(c, bucket_discard); s->discarded++; out: if (discard_locked) discard_in_flight_remove(ca, iter.pos.offset); s->seen++; bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; } static void bch2_do_discards_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); struct bch_fs *c = ca->fs; struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; /* * We're doing the commit in bch2_discard_one_bucket instead of using * for_each_btree_key_commit() so that we can increment counters after * successful commit: */ ret = bch2_trans_run(c, 
for_each_btree_key_upto(trans, iter, BTREE_ID_need_discard, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX), 0, k, bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); percpu_ref_put(&ca->io_ref); bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_dev_do_discards(struct bch_dev *ca) { struct bch_fs *c = ca->fs; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) return; if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) goto put_write_ref; if (queue_work(c->write_ref_wq, &ca->discard_work)) return; percpu_ref_put(&ca->io_ref); put_write_ref: bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) { for_each_member_device(c, ca) bch2_dev_do_discards(ca); } static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) { struct btree_iter iter; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) goto err; struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); ret = PTR_ERR_OR_ZERO(a); if (ret) goto err; BUG_ON(a->v.dirty_sectors); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); alloc_data_type_set(&a->v, a->v.data_type); ret = bch2_trans_update(trans, &iter, &a->k_i, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; } static void bch2_do_discards_fast_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); struct bch_fs *c = ca->fs; while (1) { bool got_bucket = false; u64 bucket; mutex_lock(&ca->discard_buckets_in_flight_lock); darray_for_each(ca->discard_buckets_in_flight, i) { if (i->in_progress) continue; got_bucket = true; bucket = i->bucket; i->in_progress = true; break; } mutex_unlock(&ca->discard_buckets_in_flight_lock); if (!got_bucket) break; if (ca->mi.discard && !c->opts.nochanges) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, bucket), ca->mi.bucket_size, GFP_KERNEL); int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_WATERMARK_btree| BCH_TRANS_COMMIT_no_enospc, bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); bch_err_fn(c, ret); discard_in_flight_remove(ca, bucket); if (ret) break; } percpu_ref_put(&ca->io_ref); bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) { struct bch_fs *c = ca->fs; if (discard_in_flight_add(ca, bucket, false)) return; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) return; if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) goto put_ref; if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) return; percpu_ref_put(&ca->io_ref); put_ref: bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static int invalidate_one_bucket(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); unsigned cached_sectors; int ret = 0; if (*nr_to_invalidate <= 0) return 1; if (!bch2_dev_bucket_exists(c, bucket)) { prt_str(&buf, "lru entry points to invalid bucket"); goto err; } if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; /* We expect harmless races here due to the btree 
write buffer: */ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) goto out; BUG_ON(a->v.data_type != BCH_DATA_cached); BUG_ON(a->v.dirty_sectors); if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); cached_sectors = a->v.cached_sectors; SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); a->v.gen++; a->v.data_type = 0; a->v.dirty_sectors = 0; a->v.stripe_sectors = 0; a->v.cached_sectors = 0; a->v.io_time[READ] = bch2_current_io_time(c, READ); a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); ret = bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: printbuf_exit(&buf); return ret; err: prt_str(&buf, "\n lru key: "); bch2_bkey_val_to_text(&buf, c, lru_k); prt_str(&buf, "\n lru entry: "); bch2_lru_pos_to_text(&buf, lru_iter->pos); prt_str(&buf, "\n alloc key: "); if (!a) bch2_bpos_to_text(&buf, bucket); else bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); bch_err(c, "%s", buf.buf); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { bch2_inconsistent_error(c); ret = -EINVAL; } goto out; } static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, struct bch_dev *ca, bool *wrapped) { struct bkey_s_c k; again: k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); if (!k.k && !*wrapped) { bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); *wrapped = true; goto again; } return k; } static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); struct bch_fs *c = ca->fs; struct btree_trans *trans = bch2_trans_get(c); int ret = 0; ret = bch2_btree_write_buffer_tryflush(trans); if (ret) goto err; s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); struct btree_iter iter; bool wrapped = false; bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, lru_pos(ca->dev_idx, 0, ((bch2_current_io_time(c, READ) + U32_MAX) & LRU_TIME_MAX)), 0); while (true) { bch2_trans_begin(trans); struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); ret = bkey_err(k); if (ret) goto restart_err; if (!k.k) break; ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); restart_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; bch2_btree_iter_advance(&iter); } bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_dev_do_invalidates(struct bch_dev *ca) { struct bch_fs *c = ca->fs; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) return; if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) goto put_ref; if (queue_work(c->write_ref_wq, &ca->invalidate_work)) return; percpu_ref_put(&ca->io_ref); put_ref: bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) { for_each_member_device(c, ca) bch2_dev_do_invalidates(ca); } int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, u64 bucket_start, u64 bucket_end) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bkey hole; struct bpos end = POS(ca->dev_idx, bucket_end); struct bch_member *m; unsigned long last_updated = jiffies; int ret; BUG_ON(bucket_start > bucket_end); BUG_ON(bucket_end > ca->mi.nbuckets); bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, 
POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), BTREE_ITER_prefetch); /* * Scan the alloc btree for every bucket on @ca, and add buckets to the * freespace/need_discard/need_gc_gens btrees as needed: */ while (1) { if (time_after(jiffies, last_updated + HZ * 10)) { bch_info(ca, "%s: currently at %llu/%llu", __func__, iter.pos.offset, ca->mi.nbuckets); last_updated = jiffies; } bch2_trans_begin(trans); if (bkey_ge(iter.pos, end)) { ret = 0; break; } k = bch2_get_key_or_hole(&iter, end, &hole); ret = bkey_err(k); if (ret) goto bkey_err; if (k.k->type) { /* * We process live keys in the alloc btree one at a * time: */ struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); ret = bch2_bucket_do_index(trans, ca, k, a, true) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; bch2_btree_iter_advance(&iter); } else { struct bkey_i *freespace; freespace = bch2_trans_kmalloc(trans, sizeof(*freespace)); ret = PTR_ERR_OR_ZERO(freespace); if (ret) goto bkey_err; bkey_init(&freespace->k); freespace->k.type = KEY_TYPE_set; freespace->k.p = k.k->p; freespace->k.size = k.k->size; ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; bch2_btree_iter_set_pos(&iter, k.k->p); } bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; } bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); if (ret < 0) { bch_err_msg(ca, ret, "initializing free space"); return ret; } mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); mutex_unlock(&c->sb_lock); return 0; } int bch2_fs_freespace_init(struct bch_fs *c) { int ret = 0; bool doing_init = false; /* * We can crash during the device add path, so we need to check this on * every mount: */ for_each_member_device(c, ca) { if (ca->mi.freespace_initialized) continue; if (!doing_init) { bch_info(c, "initializing freespace"); doing_init = true; } ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { bch2_dev_put(ca); bch_err_fn(c, ret); return ret; } } if (doing_init) { mutex_lock(&c->sb_lock); bch2_write_super(c); mutex_unlock(&c->sb_lock); bch_verbose(c, "done initializing freespace"); } return 0; } /* device removal */ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) { struct bpos start = POS(ca->dev_idx, 0); struct bpos end = POS(ca->dev_idx, U64_MAX); int ret; /* * We clear the LRU and need_discard btrees first so that we don't race * with bch2_do_invalidates() and bch2_do_discards() */ ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?: bch2_btree_delete_range(c, BTREE_ID_lru, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_dev_usage_remove(c, ca->dev_idx); bch_err_msg(ca, ret, "removing dev alloc info"); return ret; } /* Bucket IO clocks: */ static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, size_t bucket_nr, int rw) { struct bch_fs *c = trans->c; struct 
btree_iter iter; struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); int ret = PTR_ERR_OR_ZERO(a); if (ret) return ret; u64 now = bch2_current_io_time(c, rw); if (a->v.io_time[rw] == now) goto out; a->v.io_time[rw] = now; ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: bch2_trans_iter_exit(trans, &iter); return ret; } int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, size_t bucket_nr, int rw) { if (bch2_trans_relock(trans)) bch2_trans_begin(trans); return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw)); } /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) { u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; lockdep_assert_held(&c->state_lock); for_each_online_member(c, ca) { struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; ra_pages += bdi->ra_pages; } bch2_set_ra_pages(c, ra_pages); for_each_rw_member(c, ca) { u64 dev_reserve = 0; /* * We need to reserve buckets (from the number * of currently available buckets) against * foreground writes so that mainly copygc can * make forward progress. * * We need enough to refill the various reserves * from scratch - copygc will use its entire * reserve all at once, then run against when * its reserve is refilled (from the formerly * available buckets). * * This reserve is just used when considering if * allocations for foreground writes must wait - * not -ENOSPC calculations. */ dev_reserve += ca->nr_btree_reserve * 2; dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ dev_reserve += 1; /* btree write point */ dev_reserve += 1; /* copygc write point */ dev_reserve += 1; /* rebalance write point */ dev_reserve *= ca->mi.bucket_size; capacity += bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket); reserved_sectors += dev_reserve * 2; bucket_size_max = max_t(unsigned, bucket_size_max, ca->mi.bucket_size); } gc_reserve = c->opts.gc_reserve_bytes ? 
c->opts.gc_reserve_bytes >> 9 : div64_u64(capacity * c->opts.gc_reserve_percent, 100); reserved_sectors = max(gc_reserve, reserved_sectors); reserved_sectors = min(reserved_sectors, capacity); c->reserved = reserved_sectors; c->capacity = capacity - reserved_sectors; c->bucket_size_max = bucket_size_max; /* Wake up case someone was waiting for buckets */ closure_wake_up(&c->freelist_wait); } u64 bch2_min_rw_member_capacity(struct bch_fs *c) { u64 ret = U64_MAX; for_each_rw_member(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); return ret; } static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { struct open_bucket *ob; bool ret = false; for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); if (ob->valid && !ob->on_partial_list && ob->dev == ca->dev_idx) ret = true; spin_unlock(&ob->lock); } return ret; } /* device goes ro: */ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); /* First, remove device from allocation groups: */ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) clear_bit(ca->dev_idx, c->rw_devs[i].d); c->rw_devs_change_count++; /* * Capacity is calculated based off of devices in allocation groups: */ bch2_recalc_capacity(c); bch2_open_buckets_stop(c, ca, false); /* * Wake up threads that were blocked on allocation, so they can notice * the device can no longer be removed and the capacity has changed: */ closure_wake_up(&c->freelist_wait); /* * journal_res_get() can block waiting for free space in the journal - * it needs to notice there may not be devices to allocate from anymore: */ wake_up(&c->journal.wait); /* Now wait for any in flight writes: */ closure_wait_event(&c->open_buckets_wait, !bch2_dev_has_open_write_point(c, ca)); } /* device goes rw: */ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) if (ca->mi.data_allowed & (1 << i)) set_bit(ca->dev_idx, c->rw_devs[i].d); c->rw_devs_change_count++; } void bch2_dev_allocator_background_exit(struct bch_dev *ca) { darray_exit(&ca->discard_buckets_in_flight); } void bch2_dev_allocator_background_init(struct bch_dev *ca) { mutex_init(&ca->discard_buckets_in_flight_lock); INIT_WORK(&ca->discard_work, bch2_do_discards_work); INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work); INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work); } void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); }
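/*
 * Illustrative sketch, not bcachefs code: the per-device reserve arithmetic
 * performed by bch2_recalc_capacity() above, spelled out as a standalone
 * program for a single rw device.  All values below (bucket count, bucket
 * size, nr_btree_reserve, gc_reserve_percent) are hypothetical, and the
 * gc_reserve_bytes override is assumed to be unset so the percentage path
 * is taken.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* hypothetical device: 2^20 buckets of 1024 sectors (512 KiB each) */
        uint64_t nbuckets           = 1ULL << 20;
        uint64_t first_bucket       = 16;
        uint64_t bucket_size        = 1024;     /* in 512-byte sectors */
        uint64_t nr_btree_reserve   = 512;      /* assumed value */
        uint64_t gc_reserve_percent = 8;        /* assumed value */

        /* per-device reserve, counted in buckets, then scaled to sectors */
        uint64_t dev_reserve = 0;
        dev_reserve += nr_btree_reserve * 2;
        dev_reserve += nbuckets >> 6;           /* copygc reserve */
        dev_reserve += 1;                       /* btree write point */
        dev_reserve += 1;                       /* copygc write point */
        dev_reserve += 1;                       /* rebalance write point */
        dev_reserve *= bucket_size;

        uint64_t capacity         = (nbuckets - first_bucket) * bucket_size;
        uint64_t reserved_sectors = dev_reserve * 2;
        uint64_t gc_reserve       = capacity * gc_reserve_percent / 100;

        /* reserved space is the larger of the two, clamped to capacity */
        if (reserved_sectors < gc_reserve)
                reserved_sectors = gc_reserve;
        if (reserved_sectors > capacity)
                reserved_sectors = capacity;

        printf("capacity %llu sectors, reserved %llu, usable %llu\n",
               (unsigned long long)capacity,
               (unsigned long long)reserved_sectors,
               (unsigned long long)(capacity - reserved_sectors));
        return 0;
}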
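/*
 * Illustrative sketch, not bcachefs API: the key-offset layout checked by
 * bch2_check_discard_freespace_key() earlier in this file.  For freespace
 * btree keys the low 56 bits of the offset carry the bucket number and the
 * high 8 bits carry the freespace generation bits (compare the masking with
 * ~0ULL << 56 in that function).  The helper names here are made up for the
 * example.
 */
#include <assert.h>
#include <stdint.h>

#define EXAMPLE_GENBITS_SHIFT   56

/* combine a bucket number with genbits already positioned in the top byte */
static uint64_t example_freespace_offset(uint64_t bucket, uint64_t genbits)
{
        assert(!(bucket  & (~0ULL << EXAMPLE_GENBITS_SHIFT)));  /* bucket fits in 56 bits */
        assert(!(genbits & ~(~0ULL << EXAMPLE_GENBITS_SHIFT))); /* genbits only in top byte */
        return bucket | genbits;
}

/* split an offset back into bucket number and genbits, as the fsck check does */
static void example_freespace_offset_unpack(uint64_t offset,
                                            uint64_t *bucket,
                                            uint64_t *genbits)
{
        *bucket  = offset & ~(~0ULL << EXAMPLE_GENBITS_SHIFT);
        *genbits = offset &  (~0ULL << EXAMPLE_GENBITS_SHIFT);  /* printed as genbits >> 56 */
}

int main(void)
{
        uint64_t bucket, genbits;

        example_freespace_offset_unpack(example_freespace_offset(1234, 3ULL << EXAMPLE_GENBITS_SHIFT),
                                        &bucket, &genbits);
        assert(bucket == 1234 && (genbits >> EXAMPLE_GENBITS_SHIFT) == 3);
        return 0;
}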
/* * Created: Fri Jan 8 09:01:26 1999 by faith@valinux.com * * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Author Rickard E. (Rik) Faith <faith@valinux.com> * Author Gareth Hughes <gareth@valinux.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/export.h> #include <linux/nospec.h> #include <linux/pci.h> #include <linux/uaccess.h> #include <drm/drm_auth.h> #include <drm/drm_crtc.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_ioctl.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_internal.h" /** * DOC: getunique and setversion story * * BEWARE THE DRAGONS! MIND THE TRAPDOORS! * * In an attempt to warn anyone else who's trying to figure out what's going * on here, I'll try to summarize the story. First things first, let's clear up * the names, because the kernel internals, libdrm and the ioctls are all named * differently: * * - GET_UNIQUE ioctl, implemented by drm_getunique is wrapped up in libdrm * through the drmGetBusid function. * - The libdrm drmSetBusid function is backed by the SET_UNIQUE ioctl. All * that code is nerved in the kernel with drm_invalid_op(). * - The internal set_busid kernel functions and driver callbacks are * exclusively use by the SET_VERSION ioctl, because only drm 1.0 (which is * nerved) allowed userspace to set the busid through the above ioctl. * - Other ioctls and functions involved are named consistently. * * For anyone wondering what's the difference between drm 1.1 and 1.4: Correctly * handling pci domains in the busid on ppc. Doing this correctly was only * implemented in libdrm in 2010, hence can't be nerved yet. No one knows what's * special with drm 1.2 and 1.3. * * Now the actual horror story of how device lookup in drm works. At large, * there's 2 different ways, either by busid, or by device driver name. * * Opening by busid is fairly simple: * * 1. First call SET_VERSION to make sure pci domains are handled properly. As a * side-effect this fills out the unique name in the master structure. * 2. 
Call GET_UNIQUE to read out the unique name from the master structure, * which matches the busid thanks to step 1. If it doesn't, proceed to try * the next device node. * * Opening by name is slightly different: * * 1. Directly call VERSION to get the version and to match against the driver * name returned by that ioctl. Note that SET_VERSION is not called, which * means the unique name for the master node just opening is _not_ filled * out. This despite that with current drm device nodes are always bound to * one device, and can't be runtime assigned like with drm 1.0. * 2. Match driver name. If it mismatches, proceed to the next device node. * 3. Call GET_UNIQUE, and check whether the unique name has length zero (by * checking that the first byte in the string is 0). If that's not the case * libdrm skips and proceeds to the next device node. Probably this is just * copypasta from drm 1.0 times where a set unique name meant that the driver * was in use already, but that's just conjecture. * * Long story short: To keep the open by name logic working, GET_UNIQUE must * _not_ return a unique string when SET_VERSION hasn't been called yet, * otherwise libdrm breaks. Even when that unique string can't ever change, and * is totally irrelevant for actually opening the device because runtime * assignable device instances were only support in drm 1.0, which is long dead. * But the libdrm code in drmOpenByName somehow survived, hence this can't be * broken. */ /* * Get the bus id. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg user argument, pointing to a drm_unique structure. * \return zero on success or a negative number on failure. * * Copies the bus id from drm_device::unique into user space. */ int drm_getunique(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_unique *u = data; struct drm_master *master; mutex_lock(&dev->master_mutex); master = file_priv->master; if (u->unique_len >= master->unique_len) { if (copy_to_user(u->unique, master->unique, master->unique_len)) { mutex_unlock(&dev->master_mutex); return -EFAULT; } } u->unique_len = master->unique_len; mutex_unlock(&dev->master_mutex); return 0; } static void drm_unset_busid(struct drm_device *dev, struct drm_master *master) { kfree(master->unique); master->unique = NULL; master->unique_len = 0; } static int drm_set_busid(struct drm_device *dev, struct drm_file *file_priv) { struct drm_master *master = file_priv->master; int ret; if (master->unique != NULL) drm_unset_busid(dev, master); if (dev->dev && dev_is_pci(dev->dev)) { ret = drm_pci_set_busid(dev, master); if (ret) { drm_unset_busid(dev, master); return ret; } } else { WARN_ON(!dev->unique); master->unique = kstrdup(dev->unique, GFP_KERNEL); if (master->unique) master->unique_len = strlen(dev->unique); } return 0; } /* * Get client information. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg user argument, pointing to a drm_client structure. * * \return zero on success or a negative number on failure. * * Searches for the client with the specified index and copies its information * into userspace */ int drm_getclient(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_client *client = data; /* * Hollowed-out getclient ioctl to keep some dead old drm tests/tools * not breaking completely. 
Userspace tools stop enumerating one they * get -EINVAL, hence this is the return value we need to hand back for * no clients tracked. * * Unfortunately some clients (*cough* libva *cough*) use this in a fun * attempt to figure out whether they're authenticated or not. Since * that's the only thing they care about, give it to the directly * instead of walking one giant list. */ if (client->idx == 0) { client->auth = file_priv->authenticated; client->pid = task_pid_vnr(current); client->uid = overflowuid; client->magic = 0; client->iocs = 0; return 0; } else { return -EINVAL; } } /* * Get statistics information. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg user argument, pointing to a drm_stats structure. * * \return zero on success or a negative number on failure. */ static int drm_getstats(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_stats *stats = data; /* Clear stats to prevent userspace from eating its stack garbage. */ memset(stats, 0, sizeof(*stats)); return 0; } /* * Get device/driver capabilities */ static int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_get_cap *req = data; struct drm_crtc *crtc; req->value = 0; /* Only some caps make sense with UMS/render-only drivers. */ switch (req->capability) { case DRM_CAP_TIMESTAMP_MONOTONIC: req->value = 1; return 0; case DRM_CAP_PRIME: req->value = DRM_PRIME_CAP_IMPORT | DRM_PRIME_CAP_EXPORT; return 0; case DRM_CAP_SYNCOBJ: req->value = drm_core_check_feature(dev, DRIVER_SYNCOBJ); return 0; case DRM_CAP_SYNCOBJ_TIMELINE: req->value = drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE); return 0; } /* Other caps only work with KMS drivers */ if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; switch (req->capability) { case DRM_CAP_DUMB_BUFFER: if (dev->driver->dumb_create) req->value = 1; break; case DRM_CAP_VBLANK_HIGH_CRTC: req->value = 1; break; case DRM_CAP_DUMB_PREFERRED_DEPTH: req->value = dev->mode_config.preferred_depth; break; case DRM_CAP_DUMB_PREFER_SHADOW: req->value = dev->mode_config.prefer_shadow; break; case DRM_CAP_ASYNC_PAGE_FLIP: req->value = dev->mode_config.async_page_flip; break; case DRM_CAP_PAGE_FLIP_TARGET: req->value = 1; drm_for_each_crtc(crtc, dev) { if (!crtc->funcs->page_flip_target) req->value = 0; } break; case DRM_CAP_CURSOR_WIDTH: if (dev->mode_config.cursor_width) req->value = dev->mode_config.cursor_width; else req->value = 64; break; case DRM_CAP_CURSOR_HEIGHT: if (dev->mode_config.cursor_height) req->value = dev->mode_config.cursor_height; else req->value = 64; break; case DRM_CAP_ADDFB2_MODIFIERS: req->value = !dev->mode_config.fb_modifiers_not_supported; break; case DRM_CAP_CRTC_IN_VBLANK_EVENT: req->value = 1; break; case DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP: req->value = drm_core_check_feature(dev, DRIVER_ATOMIC) && dev->mode_config.async_page_flip; break; default: return -EINVAL; } return 0; } /* * Set device/driver capabilities */ static int drm_setclientcap(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_set_client_cap *req = data; /* No render-only settable capabilities for now */ /* Below caps that only works with KMS drivers */ if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; switch (req->capability) { case DRM_CLIENT_CAP_STEREO_3D: if (req->value > 1) return -EINVAL; file_priv->stereo_allowed = req->value; break; case DRM_CLIENT_CAP_UNIVERSAL_PLANES: if (req->value > 1) return -EINVAL; 
file_priv->universal_planes = req->value; break; case DRM_CLIENT_CAP_ATOMIC: if (!drm_core_check_feature(dev, DRIVER_ATOMIC)) return -EOPNOTSUPP; /* The modesetting DDX has a totally broken idea of atomic. */ if (current->comm[0] == 'X' && req->value == 1) { pr_info("broken atomic modeset userspace detected, disabling atomic\n"); return -EOPNOTSUPP; } if (req->value > 2) return -EINVAL; file_priv->atomic = req->value; file_priv->universal_planes = req->value; /* * No atomic user-space blows up on aspect ratio mode bits. */ file_priv->aspect_ratio_allowed = req->value; break; case DRM_CLIENT_CAP_ASPECT_RATIO: if (req->value > 1) return -EINVAL; file_priv->aspect_ratio_allowed = req->value; break; case DRM_CLIENT_CAP_WRITEBACK_CONNECTORS: if (!file_priv->atomic) return -EINVAL; if (req->value > 1) return -EINVAL; file_priv->writeback_connectors = req->value; break; case DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT: if (!drm_core_check_feature(dev, DRIVER_CURSOR_HOTSPOT)) return -EOPNOTSUPP; if (!file_priv->atomic) return -EINVAL; if (req->value > 1) return -EINVAL; file_priv->supports_virtualized_cursor_plane = req->value; break; default: return -EINVAL; } return 0; } /* * Setversion ioctl. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg user argument, pointing to a drm_lock structure. * \return zero on success or negative number on failure. * * Sets the requested interface version */ static int drm_setversion(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_set_version *sv = data; int if_version, retcode = 0; mutex_lock(&dev->master_mutex); if (sv->drm_di_major != -1) { if (sv->drm_di_major != DRM_IF_MAJOR || sv->drm_di_minor < 0 || sv->drm_di_minor > DRM_IF_MINOR) { retcode = -EINVAL; goto done; } if_version = DRM_IF_VERSION(sv->drm_di_major, sv->drm_di_minor); dev->if_version = max(if_version, dev->if_version); if (sv->drm_di_minor >= 1) { /* * Version 1.1 includes tying of DRM to specific device * Version 1.4 has proper PCI domain support */ retcode = drm_set_busid(dev, file_priv); if (retcode) goto done; } } if (sv->drm_dd_major != -1) { if (sv->drm_dd_major != dev->driver->major || sv->drm_dd_minor < 0 || sv->drm_dd_minor > dev->driver->minor) { retcode = -EINVAL; goto done; } } done: sv->drm_di_major = DRM_IF_MAJOR; sv->drm_di_minor = DRM_IF_MINOR; sv->drm_dd_major = dev->driver->major; sv->drm_dd_minor = dev->driver->minor; mutex_unlock(&dev->master_mutex); return retcode; } /** * drm_noop - DRM no-op ioctl implementation * @dev: DRM device for the ioctl * @data: data pointer for the ioctl * @file_priv: DRM file for the ioctl call * * This no-op implementation for drm ioctls is useful for deprecated * functionality where we can't return a failure code because existing userspace * checks the result of the ioctl, but doesn't care about the action. * * Always returns successfully with 0. */ int drm_noop(struct drm_device *dev, void *data, struct drm_file *file_priv) { drm_dbg_core(dev, "\n"); return 0; } EXPORT_SYMBOL(drm_noop); /** * drm_invalid_op - DRM invalid ioctl implementation * @dev: DRM device for the ioctl * @data: data pointer for the ioctl * @file_priv: DRM file for the ioctl call * * This no-op implementation for drm ioctls is useful for deprecated * functionality where we really don't want to allow userspace to call the ioctl * any more. This is the case for old ums interfaces for drivers that * transitioned to kms gradually and so kept the old legacy tables around. 
This * only applies to radeon and i915 kms drivers, other drivers shouldn't need to * use this function. * * Always fails with a return value of -EINVAL. */ int drm_invalid_op(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -EINVAL; } EXPORT_SYMBOL(drm_invalid_op); /* * Copy and IOCTL return string to user space */ static int drm_copy_field(char __user *buf, size_t *buf_len, const char *value) { size_t len; /* don't attempt to copy a NULL pointer */ if (WARN_ONCE(!value, "BUG: the value to copy was not set!")) { *buf_len = 0; return 0; } /* don't overflow userbuf */ len = strlen(value); if (len > *buf_len) len = *buf_len; /* let userspace know exact length of driver value (which could be * larger than the userspace-supplied buffer) */ *buf_len = strlen(value); /* finally, try filling in the userbuf */ if (len && buf) if (copy_to_user(buf, value, len)) return -EFAULT; return 0; } /* * Get version information * * \param inode device inode. * \param filp file pointer. * \param cmd command. * \param arg user argument, pointing to a drm_version structure. * \return zero on success or negative number on failure. * * Fills in the version information in \p arg. */ int drm_version(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_version *version = data; int err; version->version_major = dev->driver->major; version->version_minor = dev->driver->minor; version->version_patchlevel = dev->driver->patchlevel; err = drm_copy_field(version->name, &version->name_len, dev->driver->name); /* Driver date is deprecated. Userspace expects a non-empty string. */ if (!err) err = drm_copy_field(version->date, &version->date_len, "0"); if (!err) err = drm_copy_field(version->desc, &version->desc_len, dev->driver->desc); return err; } /* * Check if the passed string contains control char or spaces or * anything that would mess up a formatted output. 
*/ static int drm_validate_value_string(const char *value, size_t len) { int i; for (i = 0; i < len; i++) { if (!isascii(value[i]) || !isgraph(value[i])) return -EINVAL; } return 0; } static int drm_set_client_name(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_set_client_name *name = data; size_t len = name->name_len; void __user *user_ptr; char *new_name; if (len > DRM_CLIENT_NAME_MAX_LEN) { return -EINVAL; } else if (len) { user_ptr = u64_to_user_ptr(name->name); new_name = memdup_user_nul(user_ptr, len); if (IS_ERR(new_name)) return PTR_ERR(new_name); if (strlen(new_name) != len || drm_validate_value_string(new_name, len) < 0) { kfree(new_name); return -EINVAL; } } else { new_name = NULL; } mutex_lock(&file_priv->client_name_lock); kfree(file_priv->client_name); file_priv->client_name = new_name; mutex_unlock(&file_priv->client_name_lock); return 0; } static int drm_ioctl_permit(u32 flags, struct drm_file *file_priv) { /* ROOT_ONLY is only for CAP_SYS_ADMIN */ if (unlikely((flags & DRM_ROOT_ONLY) && !capable(CAP_SYS_ADMIN))) return -EACCES; /* AUTH is only for authenticated or render client */ if (unlikely((flags & DRM_AUTH) && !drm_is_render_client(file_priv) && !file_priv->authenticated)) return -EACCES; /* MASTER is only for master or control clients */ if (unlikely((flags & DRM_MASTER) && !drm_is_current_master(file_priv))) return -EACCES; /* Render clients must be explicitly allowed */ if (unlikely(!(flags & DRM_RENDER_ALLOW) && drm_is_render_client(file_priv))) return -EACCES; return 0; } #define DRM_IOCTL_DEF(ioctl, _func, _flags) \ [DRM_IOCTL_NR(ioctl)] = { \ .cmd = ioctl, \ .func = _func, \ .flags = _flags, \ .name = #ioctl \ } /* Ioctl table */ static const struct drm_ioctl_desc drm_ioctls[] = { DRM_IOCTL_DEF(DRM_IOCTL_VERSION, drm_version, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_GET_UNIQUE, drm_getunique, 0), DRM_IOCTL_DEF(DRM_IOCTL_GET_MAGIC, drm_getmagic, 0), DRM_IOCTL_DEF(DRM_IOCTL_GET_CLIENT, drm_getclient, 0), DRM_IOCTL_DEF(DRM_IOCTL_GET_STATS, drm_getstats, 0), DRM_IOCTL_DEF(DRM_IOCTL_GET_CAP, drm_getcap, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SET_CLIENT_CAP, drm_setclientcap, 0), DRM_IOCTL_DEF(DRM_IOCTL_SET_VERSION, drm_setversion, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_SET_UNIQUE, drm_invalid_op, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_BLOCK, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_UNBLOCK, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_AUTH_MAGIC, drm_authmagic, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_SET_MASTER, drm_setmaster_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_DROP_MASTER, drm_dropmaster_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_ADD_DRAW, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_RM_DRAW, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_FINISH, drm_noop, DRM_AUTH), DRM_IOCTL_DEF(DRM_IOCTL_WAIT_VBLANK, drm_wait_vblank_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_UPDATE_DRAW, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), DRM_IOCTL_DEF(DRM_IOCTL_GEM_CLOSE, drm_gem_close_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_GEM_FLINK, drm_gem_flink_ioctl, DRM_AUTH), DRM_IOCTL_DEF(DRM_IOCTL_GEM_OPEN, drm_gem_open_ioctl, DRM_AUTH), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETRESOURCES, drm_mode_getresources, 0), DRM_IOCTL_DEF(DRM_IOCTL_PRIME_HANDLE_TO_FD, drm_prime_handle_to_fd_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_PRIME_FD_TO_HANDLE, drm_prime_fd_to_handle_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SET_CLIENT_NAME, 
drm_set_client_name, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETPLANERESOURCES, drm_mode_getplane_res, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETCRTC, drm_mode_getcrtc, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETCRTC, drm_mode_setcrtc, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETPLANE, drm_mode_getplane, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETPLANE, drm_mode_setplane, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CURSOR, drm_mode_cursor_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETGAMMA, drm_mode_gamma_get_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETGAMMA, drm_mode_gamma_set_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETENCODER, drm_mode_getencoder, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETCONNECTOR, drm_mode_getconnector, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_ATTACHMODE, drm_noop, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_DETACHMODE, drm_noop, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETPROPERTY, drm_mode_getproperty_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETPROPERTY, drm_connector_property_set_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETPROPBLOB, drm_mode_getblob_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETFB, drm_mode_getfb, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETFB2, drm_mode_getfb2_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_ADDFB, drm_mode_addfb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_ADDFB2, drm_mode_addfb2_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_RMFB, drm_mode_rmfb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CLOSEFB, drm_mode_closefb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_PAGE_FLIP, drm_mode_page_flip_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_DIRTYFB, drm_mode_dirtyfb_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CREATE_DUMB, drm_mode_create_dumb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_MAP_DUMB, drm_mode_mmap_dumb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_DESTROY_DUMB, drm_mode_destroy_dumb_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_OBJ_GETPROPERTIES, drm_mode_obj_get_properties_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_OBJ_SETPROPERTY, drm_mode_obj_set_property_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CURSOR2, drm_mode_cursor2_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_ATOMIC, drm_mode_atomic_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_CREATEPROPBLOB, drm_mode_createblob_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_MODE_DESTROYPROPBLOB, drm_mode_destroyblob_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_CREATE, drm_syncobj_create_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_DESTROY, drm_syncobj_destroy_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, drm_syncobj_handle_to_fd_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, drm_syncobj_fd_to_handle_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_TRANSFER, drm_syncobj_transfer_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_WAIT, drm_syncobj_wait_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, drm_syncobj_timeline_wait_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_EVENTFD, drm_syncobj_eventfd_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_RESET, drm_syncobj_reset_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_SIGNAL, drm_syncobj_signal_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, drm_syncobj_timeline_signal_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_SYNCOBJ_QUERY, drm_syncobj_query_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF(DRM_IOCTL_CRTC_GET_SEQUENCE, drm_crtc_get_sequence_ioctl, 0), DRM_IOCTL_DEF(DRM_IOCTL_CRTC_QUEUE_SEQUENCE, drm_crtc_queue_sequence_ioctl, 0), 
DRM_IOCTL_DEF(DRM_IOCTL_MODE_CREATE_LEASE, drm_mode_create_lease_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_LIST_LESSEES, drm_mode_list_lessees_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_GET_LEASE, drm_mode_get_lease_ioctl, DRM_MASTER), DRM_IOCTL_DEF(DRM_IOCTL_MODE_REVOKE_LEASE, drm_mode_revoke_lease_ioctl, DRM_MASTER), }; #define DRM_CORE_IOCTL_COUNT ARRAY_SIZE(drm_ioctls) /** * DOC: driver specific ioctls * * First things first, driver private IOCTLs should only be needed for drivers * supporting rendering. Kernel modesetting is all standardized, and extended * through properties. There are a few exceptions in some existing drivers, * which define IOCTL for use by the display DRM master, but they all predate * properties. * * Now if you do have a render driver you always have to support it through * driver private properties. There's a few steps needed to wire all the things * up. * * First you need to define the structure for your IOCTL in your driver private * UAPI header in ``include/uapi/drm/my_driver_drm.h``:: * * struct my_driver_operation { * u32 some_thing; * u32 another_thing; * }; * * Please make sure that you follow all the best practices from * ``Documentation/process/botching-up-ioctls.rst``. Note that drm_ioctl() * automatically zero-extends structures, hence make sure you can add more stuff * at the end, i.e. don't put a variable sized array there. * * Then you need to define your IOCTL number, using one of DRM_IO(), DRM_IOR(), * DRM_IOW() or DRM_IOWR(). It must start with the DRM_IOCTL\_ prefix:: * * ##define DRM_IOCTL_MY_DRIVER_OPERATION \ * DRM_IOW(DRM_COMMAND_BASE, struct my_driver_operation) * * DRM driver private IOCTL must be in the range from DRM_COMMAND_BASE to * DRM_COMMAND_END. Finally you need an array of &struct drm_ioctl_desc to wire * up the handlers and set the access rights:: * * static const struct drm_ioctl_desc my_driver_ioctls[] = { * DRM_IOCTL_DEF_DRV(MY_DRIVER_OPERATION, my_driver_operation, * DRM_AUTH|DRM_RENDER_ALLOW), * }; * * And then assign this to the &drm_driver.ioctls field in your driver * structure. * * See the separate chapter on :ref:`file operations<drm_driver_fops>` for how * the driver-specific IOCTLs are wired up. */ long drm_ioctl_kernel(struct file *file, drm_ioctl_t *func, void *kdata, u32 flags) { struct drm_file *file_priv = file->private_data; struct drm_device *dev = file_priv->minor->dev; int ret; /* Update drm_file owner if fd was passed along. */ drm_file_update_pid(file_priv); if (drm_dev_is_unplugged(dev)) return -ENODEV; ret = drm_ioctl_permit(flags, file_priv); if (unlikely(ret)) return ret; return func(dev, kdata, file_priv); } EXPORT_SYMBOL(drm_ioctl_kernel); /** * drm_ioctl - ioctl callback implementation for DRM drivers * @filp: file this ioctl is called on * @cmd: ioctl cmd number * @arg: user argument * * Looks up the ioctl function in the DRM core and the driver dispatch table, * stored in &drm_driver.ioctls. It checks for necessary permission by calling * drm_ioctl_permit(), and dispatches to the respective function. * * Returns: * Zero on success, negative error code on failure. 
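 *
 * As a rough illustration of this dispatch path from userspace (the fd,
 * malloc() and the error handling are assumptions, not part of this file),
 * the usual two-pass pattern against the DRM_IOCTL_VERSION entry above is::
 *
 *     struct drm_version v = { 0 };
 *
 *     ioctl(fd, DRM_IOCTL_VERSION, &v);    // first pass: string lengths only
 *     v.name = malloc(v.name_len + 1);
 *     v.date = malloc(v.date_len + 1);
 *     v.desc = malloc(v.desc_len + 1);
 *     ioctl(fd, DRM_IOCTL_VERSION, &v);    // second pass: copy the strings
 *     v.name[v.name_len] = '\0';           // drm_copy_field() does not NUL-terminate
 *
 * (and likewise for date and desc).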
*/ long drm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct drm_file *file_priv = filp->private_data; struct drm_device *dev; const struct drm_ioctl_desc *ioctl = NULL; drm_ioctl_t *func; unsigned int nr = DRM_IOCTL_NR(cmd); int retcode = -EINVAL; char stack_kdata[128]; char *kdata = NULL; unsigned int in_size, out_size, drv_size, ksize; bool is_driver_ioctl; dev = file_priv->minor->dev; if (drm_dev_is_unplugged(dev)) return -ENODEV; if (DRM_IOCTL_TYPE(cmd) != DRM_IOCTL_BASE) return -ENOTTY; is_driver_ioctl = nr >= DRM_COMMAND_BASE && nr < DRM_COMMAND_END; if (is_driver_ioctl) { /* driver ioctl */ unsigned int index = nr - DRM_COMMAND_BASE; if (index >= dev->driver->num_ioctls) goto err_i1; index = array_index_nospec(index, dev->driver->num_ioctls); ioctl = &dev->driver->ioctls[index]; } else { /* core ioctl */ if (nr >= DRM_CORE_IOCTL_COUNT) goto err_i1; nr = array_index_nospec(nr, DRM_CORE_IOCTL_COUNT); ioctl = &drm_ioctls[nr]; } drv_size = _IOC_SIZE(ioctl->cmd); out_size = in_size = _IOC_SIZE(cmd); if ((cmd & ioctl->cmd & IOC_IN) == 0) in_size = 0; if ((cmd & ioctl->cmd & IOC_OUT) == 0) out_size = 0; ksize = max(max(in_size, out_size), drv_size); drm_dbg_core(dev, "comm=\"%s\" pid=%d, dev=0x%lx, auth=%d, %s\n", current->comm, task_pid_nr(current), (long)old_encode_dev(file_priv->minor->kdev->devt), file_priv->authenticated, ioctl->name); /* Do not trust userspace, use our own definition */ func = ioctl->func; if (unlikely(!func)) { drm_dbg_core(dev, "no function\n"); retcode = -EINVAL; goto err_i1; } if (ksize <= sizeof(stack_kdata)) { kdata = stack_kdata; } else { kdata = kmalloc(ksize, GFP_KERNEL); if (!kdata) { retcode = -ENOMEM; goto err_i1; } } if (copy_from_user(kdata, (void __user *)arg, in_size) != 0) { retcode = -EFAULT; goto err_i1; } if (ksize > in_size) memset(kdata + in_size, 0, ksize - in_size); retcode = drm_ioctl_kernel(filp, func, kdata, ioctl->flags); if (copy_to_user((void __user *)arg, kdata, out_size) != 0) retcode = -EFAULT; err_i1: if (!ioctl) drm_dbg_core(dev, "invalid ioctl: comm=\"%s\", pid=%d, dev=0x%lx, auth=%d, cmd=0x%02x, nr=0x%02x\n", current->comm, task_pid_nr(current), (long)old_encode_dev(file_priv->minor->kdev->devt), file_priv->authenticated, cmd, nr); if (kdata != stack_kdata) kfree(kdata); if (retcode) drm_dbg_core(dev, "comm=\"%s\", pid=%d, ret=%d\n", current->comm, task_pid_nr(current), retcode); return retcode; } EXPORT_SYMBOL(drm_ioctl); /** * drm_ioctl_flags - Check for core ioctl and return ioctl permission flags * @nr: ioctl number * @flags: where to return the ioctl permission flags * * This ioctl is only used by the vmwgfx driver to augment the access checks * done by the drm core and insofar a pretty decent layering violation. This * shouldn't be used by any drivers. * * Returns: * True if the @nr corresponds to a DRM core ioctl number, false otherwise. */ bool drm_ioctl_flags(unsigned int nr, unsigned int *flags) { if (nr >= DRM_COMMAND_BASE && nr < DRM_COMMAND_END) return false; if (nr >= DRM_CORE_IOCTL_COUNT) return false; nr = array_index_nospec(nr, DRM_CORE_IOCTL_COUNT); *flags = drm_ioctls[nr].flags; return true; } EXPORT_SYMBOL(drm_ioctl_flags);
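
/*
 * A minimal sketch of how the pieces from the "driver specific ioctls" DOC
 * section above typically come together in a driver. The my_driver_* names
 * are hypothetical and only illustrate where &drm_driver.ioctls and
 * &drm_driver.num_ioctls point; DEFINE_DRM_GEM_FOPS() supplies the
 * &file_operations whose .unlocked_ioctl is drm_ioctl()::
 *
 *	static const struct drm_ioctl_desc my_driver_ioctls[] = {
 *		DRM_IOCTL_DEF_DRV(MY_DRIVER_OPERATION, my_driver_operation,
 *				  DRM_AUTH | DRM_RENDER_ALLOW),
 *	};
 *
 *	DEFINE_DRM_GEM_FOPS(my_driver_fops);
 *
 *	static const struct drm_driver my_driver = {
 *		.driver_features = DRIVER_GEM | DRIVER_RENDER,
 *		.ioctls		 = my_driver_ioctls,
 *		.num_ioctls	 = ARRAY_SIZE(my_driver_ioctls),
 *		.fops		 = &my_driver_fops,
 *		.name		 = "my_driver",
 *		.desc		 = "illustrative example only",
 *	};
 */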
/*
 * An async IO implementation for Linux
 * Written by Benjamin LaHaise <bcrl@kvack.org>
 *
 * Implements an efficient asynchronous io interface.
 *
 * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
 * Copyright 2018 Christoph Hellwig.
 *
 * See ../COPYING for licensing terms.
 */
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/refcount.h>
#include <linux/uio.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/migrate.h>
#include <linux/ramfs.h>
#include <linux/percpu-refcount.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>

#include <linux/uaccess.h>
#include <linux/nospec.h>

#include "internal.h"

#define KIOCB_KEY		0

#define AIO_RING_MAGIC			0xa10a10a1
#define AIO_RING_COMPAT_FEATURES	1
#define AIO_RING_INCOMPAT_FEATURES	0
struct aio_ring {
	unsigned	id;	/* kernel internal index number */
	unsigned	nr;	/* number of io_events */
	unsigned	head;	/* Written to by userland or under ring_lock
				 * mutex by aio_read_events_ring(). */
	unsigned	tail;

	unsigned	magic;
	unsigned	compat_features;
	unsigned	incompat_features;
	unsigned	header_length;	/* size of aio_ring */

	struct io_event		io_events[];
}; /* 128 bytes + ring size */

/*
 * Plugging is meant to work with larger batches of IOs. If we don't
 * have more than the below, then don't bother setting up a plug.
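 *
 * Concretely, the submission path only brackets its iocb loop with a block
 * plug when the batch is big enough; a sketch of that pattern (not a
 * verbatim copy of io_submit()) is::
 *
 *	struct blk_plug plug;
 *
 *	if (nr > AIO_PLUG_THRESHOLD)
 *		blk_start_plug(&plug);
 *	for (i = 0; i < nr; i++)
 *		io_submit_one(ctx, iocbpp[i], false);	// conceptually
 *	if (nr > AIO_PLUG_THRESHOLD)
 *		blk_finish_plug(&plug);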
*/ #define AIO_PLUG_THRESHOLD 2 #define AIO_RING_PAGES 8 struct kioctx_table { struct rcu_head rcu; unsigned nr; struct kioctx __rcu *table[] __counted_by(nr); }; struct kioctx_cpu { unsigned reqs_available; }; struct ctx_rq_wait { struct completion comp; atomic_t count; }; struct kioctx { struct percpu_ref users; atomic_t dead; struct percpu_ref reqs; unsigned long user_id; struct kioctx_cpu __percpu *cpu; /* * For percpu reqs_available, number of slots we move to/from global * counter at a time: */ unsigned req_batch; /* * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. * * The real limit is nr_events - 1, which will be larger (see * aio_setup_ring()) */ unsigned max_reqs; /* Size of ringbuffer, in units of struct io_event */ unsigned nr_events; unsigned long mmap_base; unsigned long mmap_size; struct folio **ring_folios; long nr_pages; struct rcu_work free_rwork; /* see free_ioctx() */ /* * signals when all in-flight requests are done */ struct ctx_rq_wait *rq_wait; struct { /* * This counts the number of available slots in the ringbuffer, * so we avoid overflowing it: it's decremented (if positive) * when allocating a kiocb and incremented when the resulting * io_event is pulled off the ringbuffer. * * We batch accesses to it with a percpu version. */ atomic_t reqs_available; } ____cacheline_aligned_in_smp; struct { spinlock_t ctx_lock; struct list_head active_reqs; /* used for cancellation */ } ____cacheline_aligned_in_smp; struct { struct mutex ring_lock; wait_queue_head_t wait; } ____cacheline_aligned_in_smp; struct { unsigned tail; unsigned completed_events; spinlock_t completion_lock; } ____cacheline_aligned_in_smp; struct folio *internal_folios[AIO_RING_PAGES]; struct file *aio_ring_file; unsigned id; }; /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in <linux/fs.h> */ struct fsync_iocb { struct file *file; struct work_struct work; bool datasync; struct cred *creds; }; struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; bool cancelled; bool work_scheduled; bool work_need_resched; struct wait_queue_entry wait; struct work_struct work; }; /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can * access the file pointer through any of the sub-structs, * or directly as just 'ki_filp' in this struct. */ struct aio_kiocb { union { struct file *ki_filp; struct kiocb rw; struct fsync_iocb fsync; struct poll_iocb poll; }; struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; struct io_event ki_res; struct list_head ki_list; /* the aio core uses this * for cancellation */ refcount_t ki_refcnt; /* * If the aio_resfd field of the userspace iocb is not zero, * this is the underlying eventfd context to deliver events to. 
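 *
 * A minimal userspace sketch of how that is requested (data_fd, buf and the
 * io_setup()/io_submit() plumbing are assumed, not shown here)::
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *	struct iocb cb = {
 *		.aio_lio_opcode	= IOCB_CMD_PREAD,
 *		.aio_fildes	= data_fd,
 *		.aio_buf	= (__u64)(uintptr_t)buf,
 *		.aio_nbytes	= sizeof(buf),
 *		.aio_flags	= IOCB_FLAG_RESFD,
 *		.aio_resfd	= efd,
 *	};
 *
 * After io_submit(), a read() on efd reports that completions are sitting in
 * the ring, without having to poll the ring itself.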
*/ struct eventfd_ctx *ki_eventfd; }; /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); static unsigned long aio_nr; /* current system wide number of aio requests */ static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ #ifdef CONFIG_SYSCTL static struct ctl_table aio_sysctls[] = { { .procname = "aio-nr", .data = &aio_nr, .maxlen = sizeof(aio_nr), .mode = 0444, .proc_handler = proc_doulongvec_minmax, }, { .procname = "aio-max-nr", .data = &aio_max_nr, .maxlen = sizeof(aio_max_nr), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, }; static void __init aio_sysctl_init(void) { register_sysctl_init("fs", aio_sysctls); } #else #define aio_sysctl_init() do { } while (0) #endif static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; static struct vfsmount *aio_mnt; static const struct file_operations aio_ring_fops; static const struct address_space_operations aio_ctx_aops; static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) { struct file *file; struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); inode->i_mapping->a_ops = &aio_ctx_aops; inode->i_mapping->i_private_data = ctx; inode->i_size = PAGE_SIZE * nr_pages; file = alloc_file_pseudo(inode, aio_mnt, "[aio]", O_RDWR, &aio_ring_fops); if (IS_ERR(file)) iput(inode); return file; } static int aio_init_fs_context(struct fs_context *fc) { if (!init_pseudo(fc, AIO_RING_MAGIC)) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; return 0; } /* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence. */ static int __init aio_setup(void) { static struct file_system_type aio_fs = { .name = "aio", .init_fs_context = aio_init_fs_context, .kill_sb = kill_anon_super, }; aio_mnt = kern_mount(&aio_fs); if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); aio_sysctl_init(); return 0; } __initcall(aio_setup); static void put_aio_ring_file(struct kioctx *ctx) { struct file *aio_ring_file = ctx->aio_ring_file; struct address_space *i_mapping; if (aio_ring_file) { truncate_setsize(file_inode(aio_ring_file), 0); /* Prevent further access to the kioctx from migratepages */ i_mapping = aio_ring_file->f_mapping; spin_lock(&i_mapping->i_private_lock); i_mapping->i_private_data = NULL; ctx->aio_ring_file = NULL; spin_unlock(&i_mapping->i_private_lock); fput(aio_ring_file); } } static void aio_free_ring(struct kioctx *ctx) { int i; /* Disconnect the kiotx from the ring file. This prevents future * accesses to the kioctx from page migration. 
*/ put_aio_ring_file(ctx); for (i = 0; i < ctx->nr_pages; i++) { struct folio *folio = ctx->ring_folios[i]; if (!folio) continue; pr_debug("pid(%d) [%d] folio->count=%d\n", current->pid, i, folio_ref_count(folio)); ctx->ring_folios[i] = NULL; folio_put(folio); } if (ctx->ring_folios && ctx->ring_folios != ctx->internal_folios) { kfree(ctx->ring_folios); ctx->ring_folios = NULL; } } static int aio_ring_mremap(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); if (!table) goto out_unlock; for (i = 0; i < table->nr; i++) { struct kioctx *ctx; ctx = rcu_dereference(table->table[i]); if (ctx && ctx->aio_ring_file == file) { if (!atomic_read(&ctx->dead)) { ctx->user_id = ctx->mmap_base = vma->vm_start; res = 0; } break; } } out_unlock: rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); return res; } static const struct vm_operations_struct aio_ring_vm_ops = { .mremap = aio_ring_mremap, #if IS_ENABLED(CONFIG_MMU) .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, #endif }; static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { vm_flags_set(vma, VM_DONTEXPAND); vma->vm_ops = &aio_ring_vm_ops; return 0; } static const struct file_operations aio_ring_fops = { .mmap = aio_ring_mmap, }; #if IS_ENABLED(CONFIG_MIGRATION) static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { struct kioctx *ctx; unsigned long flags; pgoff_t idx; int rc = 0; /* mapping->i_private_lock here protects against the kioctx teardown. */ spin_lock(&mapping->i_private_lock); ctx = mapping->i_private_data; if (!ctx) { rc = -EINVAL; goto out; } /* The ring_lock mutex. The prevents aio_read_events() from writing * to the ring's head, and prevents page migration from mucking in * a partially initialized kiotx. */ if (!mutex_trylock(&ctx->ring_lock)) { rc = -EAGAIN; goto out; } idx = src->index; if (idx < (pgoff_t)ctx->nr_pages) { /* Make sure the old folio hasn't already been changed */ if (ctx->ring_folios[idx] != src) rc = -EAGAIN; } else rc = -EINVAL; if (rc != 0) goto out_unlock; /* Writeback must be complete */ BUG_ON(folio_test_writeback(src)); folio_get(dst); rc = folio_migrate_mapping(mapping, dst, src, 1); if (rc != MIGRATEPAGE_SUCCESS) { folio_put(dst); goto out_unlock; } /* Take completion_lock to prevent other writes to the ring buffer * while the old folio is copied to the new. This prevents new * events from being lost. */ spin_lock_irqsave(&ctx->completion_lock, flags); folio_copy(dst, src); folio_migrate_flags(dst, src); BUG_ON(ctx->ring_folios[idx] != src); ctx->ring_folios[idx] = dst; spin_unlock_irqrestore(&ctx->completion_lock, flags); /* The old folio is no longer accessible. 
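 * Drop the reference the ring array held on it; the ring slot now points at
 * dst, which took its own reference via the folio_get() above.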
*/ folio_put(src); out_unlock: mutex_unlock(&ctx->ring_lock); out: spin_unlock(&mapping->i_private_lock); return rc; } #else #define aio_migrate_folio NULL #endif static const struct address_space_operations aio_ctx_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = aio_migrate_folio, }; static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) { struct aio_ring *ring; struct mm_struct *mm = current->mm; unsigned long size, unused; int nr_pages; int i; struct file *file; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; nr_pages = PFN_UP(size); if (nr_pages < 0) return -EINVAL; file = aio_private_file(ctx, nr_pages); if (IS_ERR(file)) { ctx->aio_ring_file = NULL; return -ENOMEM; } ctx->aio_ring_file = file; nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); ctx->ring_folios = ctx->internal_folios; if (nr_pages > AIO_RING_PAGES) { ctx->ring_folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_KERNEL); if (!ctx->ring_folios) { put_aio_ring_file(ctx); return -ENOMEM; } } for (i = 0; i < nr_pages; i++) { struct folio *folio; folio = __filemap_get_folio(file->f_mapping, i, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_USER | __GFP_ZERO); if (IS_ERR(folio)) break; pr_debug("pid(%d) [%d] folio->count=%d\n", current->pid, i, folio_ref_count(folio)); folio_end_read(folio, true); ctx->ring_folios[i] = folio; } ctx->nr_pages = i; if (unlikely(i != nr_pages)) { aio_free_ring(ctx); return -ENOMEM; } ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); if (mmap_write_lock_killable(mm)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -EINTR; } ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 0, 0, &unused, NULL); mmap_write_unlock(mm); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; aio_free_ring(ctx); return -ENOMEM; } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ ring = folio_address(ctx->ring_folios[0]); ring->nr = nr_events; /* user copy */ ring->id = ~0U; ring->head = ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); flush_dcache_folio(ctx->ring_folios[0]); return 0; } #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { struct aio_kiocb *req; struct kioctx *ctx; unsigned long flags; /* * kiocb didn't come from aio or is neither a read nor a write, hence * ignore it. */ if (!(iocb->ki_flags & IOCB_AIO_RW)) return; req = container_of(iocb, struct aio_kiocb, rw); if (WARN_ON_ONCE(!list_empty(&req->ki_list))) return; ctx = req->ki_ctx; spin_lock_irqsave(&ctx->ctx_lock, flags); list_add_tail(&req->ki_list, &ctx->active_reqs); req->ki_cancel = cancel; spin_unlock_irqrestore(&ctx->ctx_lock, flags); } EXPORT_SYMBOL(kiocb_set_cancel_fn); /* * free_ioctx() should be RCU delayed to synchronize against the RCU * protected lookup_ioctx() and also needs process context to call * aio_free_ring(). Use rcu_work. 
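 *
 * (As an aside on the AIO_EVENTS_* macros above: assuming 4 KiB pages, a
 * 32-byte struct io_event and a 32-byte struct aio_ring header,
 * AIO_EVENTS_PER_PAGE is 128, AIO_EVENTS_FIRST_PAGE is 127 and
 * AIO_EVENTS_OFFSET is 1, so logical event slot i lives in
 * ring_folios[(i + 1) / 128] at index (i + 1) % 128; that is exactly the
 * "pos" arithmetic used by aio_complete() and aio_read_events_ring() below.)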
*/ static void free_ioctx(struct work_struct *work) { struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx, free_rwork); pr_debug("freeing %p\n", ctx); aio_free_ring(ctx); free_percpu(ctx->cpu); percpu_ref_exit(&ctx->reqs); percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); } static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); /* At this point we know that there are no any in-flight requests */ if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) complete(&ctx->rq_wait->comp); /* Synchronize against RCU protected table->table[] dereferences */ INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); queue_rcu_work(system_wq, &ctx->free_rwork); } /* * When this function runs, the kioctx has been removed from the "hash table" * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ static void free_ioctx_users(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, users); struct aio_kiocb *req; spin_lock_irq(&ctx->ctx_lock); while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, struct aio_kiocb, ki_list); req->ki_cancel(&req->rw); list_del_init(&req->ki_list); } spin_unlock_irq(&ctx->ctx_lock); percpu_ref_kill(&ctx->reqs); percpu_ref_put(&ctx->reqs); } static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) { unsigned i, new_nr; struct kioctx_table *table, *old; struct aio_ring *ring; spin_lock(&mm->ioctx_lock); table = rcu_dereference_raw(mm->ioctx_table); while (1) { if (table) for (i = 0; i < table->nr; i++) if (!rcu_access_pointer(table->table[i])) { ctx->id = i; rcu_assign_pointer(table->table[i], ctx); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, * we are protected from page migration * changes ring_folios by ->ring_lock. */ ring = folio_address(ctx->ring_folios[0]); ring->id = ctx->id; return 0; } new_nr = (table ? table->nr : 1) * 4; spin_unlock(&mm->ioctx_lock); table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL); if (!table) return -ENOMEM; table->nr = new_nr; spin_lock(&mm->ioctx_lock); old = rcu_dereference_raw(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); } else if (table->nr > old->nr) { memcpy(table->table, old->table, old->nr * sizeof(struct kioctx *)); rcu_assign_pointer(mm->ioctx_table, table); kfree_rcu(old, rcu); } else { kfree(table); table = old; } } } static void aio_nr_sub(unsigned nr) { spin_lock(&aio_nr_lock); if (WARN_ON(aio_nr - nr > aio_nr)) aio_nr = 0; else aio_nr -= nr; spin_unlock(&aio_nr_lock); } /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ static struct kioctx *ioctx_alloc(unsigned nr_events) { struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; /* * Store the original nr_events -- what userspace passed to io_setup(), * for counting against the global limit -- before it changes. */ unsigned int max_reqs = nr_events; /* * We keep track of the number of available ringbuffer slots, to prevent * overflow (reqs_available), and we also use percpu counters for this. 
* * So since up to half the slots might be on other cpu's percpu counters * and unavailable, double nr_events so userspace sees what they * expected: additionally, we move req_batch slots to/from percpu * counters at a time, so make sure that isn't 0: */ nr_events = max(nr_events, num_possible_cpus() * 4); nr_events *= 2; /* Prevent overflows */ if (nr_events > (0x10000000U / sizeof(struct io_event))) { pr_debug("ENOMEM: nr_events too high\n"); return ERR_PTR(-EINVAL); } if (!nr_events || (unsigned long)max_reqs > aio_max_nr) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); ctx->max_reqs = max_reqs; spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); /* Protect against page migration throughout kiotx setup by keeping * the ring_lock mutex held until setup is complete. */ mutex_lock(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err; if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) goto err; err = aio_setup_ring(ctx, nr_events); if (err < 0) goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); if (ctx->req_batch < 1) ctx->req_batch = 1; /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + ctx->max_reqs > aio_max_nr || aio_nr + ctx->max_reqs < aio_nr) { spin_unlock(&aio_nr_lock); err = -EAGAIN; goto err_ctx; } aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */ err = ioctx_add_table(ctx, mm); if (err) goto err_cleanup; /* Release the ring_lock mutex now that all setup is complete. */ mutex_unlock(&ctx->ring_lock); pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; err_cleanup: aio_nr_sub(ctx->max_reqs); err_ctx: atomic_set(&ctx->dead, 1); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); aio_free_ring(ctx); err: mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); percpu_ref_exit(&ctx->reqs); percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } /* kill_ioctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, struct ctx_rq_wait *wait) { struct kioctx_table *table; spin_lock(&mm->ioctx_lock); if (atomic_xchg(&ctx->dead, 1)) { spin_unlock(&mm->ioctx_lock); return -EINVAL; } table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); RCU_INIT_POINTER(table->table[ctx->id], NULL); spin_unlock(&mm->ioctx_lock); /* free_ioctx_reqs() will do the necessary RCU synchronization */ wake_up_all(&ctx->wait); /* * It'd be more correct to do this in free_ioctx(), after all * the outstanding kiocbs have finished - but by then io_destroy * has already returned, so io_setup() could potentially return * -EAGAIN with no ioctxs actually in use (as far as userspace * could tell). 
*/ aio_nr_sub(ctx->max_reqs); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); ctx->rq_wait = wait; percpu_ref_kill(&ctx->users); return 0; } /* * exit_aio: called when the last user of mm goes away. At this point, there is * no way for any new requests to be submited or any of the io_* syscalls to be * called on the context. * * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on * them. */ void exit_aio(struct mm_struct *mm) { struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); struct ctx_rq_wait wait; int i, skipped; if (!table) return; atomic_set(&wait.count, table->nr); init_completion(&wait.comp); skipped = 0; for (i = 0; i < table->nr; ++i) { struct kioctx *ctx = rcu_dereference_protected(table->table[i], true); if (!ctx) { skipped++; continue; } /* * We don't need to bother with munmap() here - exit_mmap(mm) * is coming and it'll unmap everything. And we simply can't, * this is not necessarily our ->mm. * Since kill_ioctx() uses non-zero ->mmap_size as indicator * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; kill_ioctx(mm, ctx, &wait); } if (!atomic_sub_and_test(skipped, &wait.count)) { /* Wait until all IO for the context are done. */ wait_for_completion(&wait.comp); } RCU_INIT_POINTER(mm->ioctx_table, NULL); kfree(table); } static void put_reqs_available(struct kioctx *ctx, unsigned nr) { struct kioctx_cpu *kcpu; unsigned long flags; local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); kcpu->reqs_available += nr; while (kcpu->reqs_available >= ctx->req_batch * 2) { kcpu->reqs_available -= ctx->req_batch; atomic_add(ctx->req_batch, &ctx->reqs_available); } local_irq_restore(flags); } static bool __get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; unsigned long flags; local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { int avail = atomic_read(&ctx->reqs_available); do { if (avail < ctx->req_batch) goto out; } while (!atomic_try_cmpxchg(&ctx->reqs_available, &avail, avail - ctx->req_batch)); kcpu->reqs_available += ctx->req_batch; } ret = true; kcpu->reqs_available--; out: local_irq_restore(flags); return ret; } /* refill_reqs_available * Updates the reqs_available reference counts used for tracking the * number of free slots in the completion ring. This can be called * from aio_complete() (to optimistically update reqs_available) or * from aio_get_req() (the we're out of events case). It must be * called holding ctx->completion_lock. */ static void refill_reqs_available(struct kioctx *ctx, unsigned head, unsigned tail) { unsigned events_in_ring, completed; /* Clamp head since userland can write to it. */ head %= ctx->nr_events; if (head <= tail) events_in_ring = tail - head; else events_in_ring = ctx->nr_events - (head - tail); completed = ctx->completed_events; if (events_in_ring < completed) completed -= events_in_ring; else completed = 0; if (!completed) return; ctx->completed_events -= completed; put_reqs_available(ctx, completed); } /* user_refill_reqs_available * Called to refill reqs_available when aio_get_req() encounters an * out of space in the completion ring. */ static void user_refill_reqs_available(struct kioctx *ctx) { spin_lock_irq(&ctx->completion_lock); if (ctx->completed_events) { struct aio_ring *ring; unsigned head; /* Access of ring->head may race with aio_read_events_ring() * here, but that's okay since whether we read the old version * or the new version, and either will be valid. 
The important * part is that head cannot pass tail since we prevent * aio_complete() from updating tail by holding * ctx->completion_lock. Even if head is invalid, the check * against ctx->completed_events below will make sure we do the * safe/right thing. */ ring = folio_address(ctx->ring_folios[0]); head = ring->head; refill_reqs_available(ctx, head, ctx->tail); } spin_unlock_irq(&ctx->completion_lock); } static bool get_reqs_available(struct kioctx *ctx) { if (__get_reqs_available(ctx)) return true; user_refill_reqs_available(ctx); return __get_reqs_available(ctx); } /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. * * The refcount is initialized to 2 - one for the async op completion, * one for the synchronous code that does this. */ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { struct aio_kiocb *req; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) return NULL; if (unlikely(!get_reqs_available(ctx))) { kmem_cache_free(kiocb_cachep, req); return NULL; } percpu_ref_get(&ctx->reqs); req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); refcount_set(&req->ki_refcnt, 2); req->ki_eventfd = NULL; return req; } static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; struct kioctx_table *table; unsigned id; if (get_user(id, &ring->id)) return NULL; rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); if (!table || id >= table->nr) goto out; id = array_index_nospec(id, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users)) ret = ctx; } out: rcu_read_unlock(); return ret; } static inline void iocb_destroy(struct aio_kiocb *iocb) { if (iocb->ki_eventfd) eventfd_ctx_put(iocb->ki_eventfd); if (iocb->ki_filp) fput(iocb->ki_filp); percpu_ref_put(&iocb->ki_ctx->reqs); kmem_cache_free(kiocb_cachep, iocb); } struct aio_waiter { struct wait_queue_entry w; size_t min_nr; }; /* aio_complete * Called when the io request on the given iocb is complete. */ static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; unsigned tail, pos, head, avail; unsigned long flags; /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail * pointer since we might be called from irq context. */ spin_lock_irqsave(&ctx->completion_lock, flags); tail = ctx->tail; pos = tail + AIO_EVENTS_OFFSET; if (++tail >= ctx->nr_events) tail = 0; ev_page = folio_address(ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; *event = iocb->ki_res; flush_dcache_folio(ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, (void __user *)(unsigned long)iocb->ki_res.obj, iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); /* after flagging the request as done, we * must never even look at it again */ smp_wmb(); /* make event visible before updating tail */ ctx->tail = tail; ring = folio_address(ctx->ring_folios[0]); head = ring->head; ring->tail = tail; flush_dcache_folio(ctx->ring_folios[0]); ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); avail = tail > head ? 
tail - head : tail + ctx->nr_events - head; spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); /* * Check if the user asked us to deliver the result through an * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ if (iocb->ki_eventfd) eventfd_signal(iocb->ki_eventfd); /* * We have to order our ring_info tail store above and test * of the wait list below outside the wait lock. This is * like in wake_up_bit() where clearing a bit has to be * ordered with the unlocked test. */ smp_mb(); if (waitqueue_active(&ctx->wait)) { struct aio_waiter *curr, *next; unsigned long flags; spin_lock_irqsave(&ctx->wait.lock, flags); list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) if (avail >= curr->min_nr) { wake_up_process(curr->w.private); list_del_init_careful(&curr->w.entry); } spin_unlock_irqrestore(&ctx->wait.lock, flags); } } static inline void iocb_put(struct aio_kiocb *iocb) { if (refcount_dec_and_test(&iocb->ki_refcnt)) { aio_complete(iocb); iocb_destroy(iocb); } } /* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of * events fetched */ static long aio_read_events_ring(struct kioctx *ctx, struct io_event __user *event, long nr) { struct aio_ring *ring; unsigned head, tail, pos; long ret = 0; int copy_ret; /* * The mutex can block and wake us up and that will cause * wait_event_interruptible_hrtimeout() to schedule without sleeping * and repeat. This should be rare enough that it doesn't cause * peformance issues. See the comment in read_events() for more detail. */ sched_annotate_sleep(); mutex_lock(&ctx->ring_lock); /* Access to ->ring_folios here is protected by ctx->ring_lock. */ ring = folio_address(ctx->ring_folios[0]); head = ring->head; tail = ring->tail; /* * Ensure that once we've read the current tail pointer, that * we also see the events that were stored up to the tail. */ smp_rmb(); pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); if (head == tail) goto out; head %= ctx->nr_events; tail %= ctx->nr_events; while (ret < nr) { long avail; struct io_event *ev; struct folio *folio; avail = (head <= tail ? tail : ctx->nr_events) - head; if (head == tail) break; pos = head + AIO_EVENTS_OFFSET; folio = ctx->ring_folios[pos / AIO_EVENTS_PER_PAGE]; pos %= AIO_EVENTS_PER_PAGE; avail = min(avail, nr - ret); avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); ev = folio_address(folio); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); if (unlikely(copy_ret)) { ret = -EFAULT; goto out; } ret += avail; head += avail; head %= ctx->nr_events; } ring = folio_address(ctx->ring_folios[0]); ring->head = head; flush_dcache_folio(ctx->ring_folios[0]); pr_debug("%li h%u t%u\n", ret, head, tail); out: mutex_unlock(&ctx->ring_lock); return ret; } static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, long *i) { long ret = aio_read_events_ring(ctx, event + *i, nr - *i); if (ret > 0) *i += ret; if (unlikely(atomic_read(&ctx->dead))) ret = -EINVAL; if (!*i) *i = ret; return ret < 0 || *i >= min_nr; } static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { struct hrtimer_sleeper t; struct aio_waiter w; long ret = 0, ret2 = 0; /* * Note that aio_read_events() is being called as the conditional - i.e. * we're calling it after prepare_to_wait() has set task state to * TASK_INTERRUPTIBLE. 
* * But aio_read_events() can block, and if it blocks it's going to flip * the task state back to TASK_RUNNING. * * This should be ok, provided it doesn't flip the state back to * TASK_RUNNING and return 0 too much - that causes us to spin. That * will only happen if the mutex_lock() call blocks, and we then find * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ aio_read_events(ctx, min_nr, nr, event, &ret); if (until == 0 || ret < 0 || ret >= min_nr) return ret; hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); if (until != KTIME_MAX) { hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); } init_wait(&w.w); while (1) { unsigned long nr_got = ret; w.min_nr = min_nr - ret; ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE); if (!ret2 && !t.task) ret2 = -ETIME; if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2) break; if (nr_got == ret) schedule(); } finish_wait(&ctx->wait, &w.w); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); return ret; } /* sys_io_setup: * Create an aio_context capable of receiving at least nr_events. * ctxp must not point to an aio_context that already exists, and * must be initialized to 0 prior to the call. On successful * creation of the aio_context, *ctxp is filled in with the resulting * handle. May fail with -EINVAL if *ctxp is not initialized, * if the specified nr_events exceeds internal limits. May fail * with -EAGAIN if the specified nr_events exceeds the user's limit * of available events. May fail with -ENOMEM if insufficient kernel * resources are available. May fail with -EFAULT if an invalid * pointer is passed for ctxp. Will fail with -ENOSYS if not * implemented. */ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) { struct kioctx *ioctx = NULL; unsigned long ctx; long ret; ret = get_user(ctx, ctxp); if (unlikely(ret)) goto out; ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n", ctx, nr_events); goto out; } ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } out: return ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p) { struct kioctx *ioctx = NULL; unsigned long ctx; long ret; ret = get_user(ctx, ctx32p); if (unlikely(ret)) goto out; ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { pr_debug("EINVAL: ctx %lu nr_events %u\n", ctx, nr_events); goto out; } ioctx = ioctx_alloc(nr_events); ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { /* truncating is ok because it's a user address */ ret = put_user((u32)ioctx->user_id, ctx32p); if (ret) kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } out: return ret; } #endif /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not * implemented. May fail with -EINVAL if the context pointed to * is invalid. */ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { struct ctx_rq_wait wait; int ret; init_completion(&wait.comp); atomic_set(&wait.count, 1); /* Pass requests_done to kill_ioctx() where it can be set * in a thread-safe way. 
If we try to set it here then we have * a race condition if two io_destroy() called simultaneously. */ ret = kill_ioctx(current->mm, ioctx, &wait); percpu_ref_put(&ioctx->users); /* Wait until all IO for the context are done. Otherwise kernel * keep using user-space buffers even if user thinks the context * is destroyed. */ if (!ret) wait_for_completion(&wait.comp); return ret; } pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } static void aio_remove_iocb(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; unsigned long flags; spin_lock_irqsave(&ctx->ctx_lock, flags); list_del(&iocb->ki_list); spin_unlock_irqrestore(&ctx->ctx_lock, flags); } static void aio_complete_rw(struct kiocb *kiocb, long res) { struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); if (!list_empty_careful(&iocb->ki_list)) aio_remove_iocb(iocb); if (kiocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(kiocb->ki_filp); if (S_ISREG(inode->i_mode)) kiocb_end_write(kiocb); } iocb->ki_res.res = res; iocb->ki_res.res2 = 0; iocb_put(iocb); } static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW; if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { /* * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then * aio_reqprio is interpreted as an I/O scheduling * class and priority. */ ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); return ret; } req->ki_ioprio = iocb->aio_reqprio; } else req->ki_ioprio = get_current_ioprio(); ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ return 0; } static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; size_t len = iocb->aio_nbytes; if (!vectored) { ssize_t ret = import_ubuf(rw, buf, len, iter); *iovec = NULL; return ret; } return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat); } static inline void aio_rw_done(struct kiocb *req, ssize_t ret) { switch (ret) { case -EIOCBQUEUED: break; case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTARTNOHAND: case -ERESTART_RESTARTBLOCK: /* * There's no easy way to restart the syscall since other AIO's * may be already running. Just fail this IO with EINTR. 
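 *
 * (For reference, the IOCB_FLAG_IOPRIO path set up in aio_prep_rw() above is
 * driven from userspace roughly like::
 *
 *	cb.aio_flags   |= IOCB_FLAG_IOPRIO;
 *	cb.aio_reqprio  = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
 *
 * which aio_prep_rw() copies into req->ki_ioprio after ioprio_check_cap()
 * succeeds.)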
*/ ret = -EINTR; fallthrough; default: req->ki_complete(req, ret); } } static int aio_read(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; int ret; ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; if (unlikely(!file->f_op->read_iter)) return -EINVAL; ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) aio_rw_done(req, file->f_op->read_iter(req, &iter)); kfree(iovec); return ret; } static int aio_write(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; int ret; ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (unlikely(!file->f_op->write_iter)) return -EINVAL; ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { if (S_ISREG(file_inode(file)->i_mode)) kiocb_start_write(req); req->ki_flags |= IOCB_WRITE; aio_rw_done(req, file->f_op->write_iter(req, &iter)); } kfree(iovec); return ret; } static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); const struct cred *old_cred = override_creds(iocb->fsync.creds); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); revert_creds(old_cred); put_cred(iocb->fsync.creds); iocb_put(iocb); } static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, bool datasync) { if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) return -EINVAL; if (unlikely(!req->file->f_op->fsync)) return -EINVAL; req->creds = prepare_creds(); if (!req->creds) return -ENOMEM; req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); schedule_work(&req->work); return 0; } static void aio_poll_put_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); iocb_put(iocb); } /* * Safely lock the waitqueue which the request is on, synchronizing with the * case where the ->poll() provider decides to free its waitqueue early. * * Returns true on success, meaning that req->head->lock was locked, req->wait * is on req->head, and an RCU read lock was taken. Returns false if the * request was already removed from its waitqueue (which might no longer exist). */ static bool poll_iocb_lock_wq(struct poll_iocb *req) { wait_queue_head_t *head; /* * While we hold the waitqueue lock and the waitqueue is nonempty, * wake_up_pollfree() will wait for us. However, taking the waitqueue * lock in the first place can race with the waitqueue being freed. * * We solve this as eventpoll does: by taking advantage of the fact that * all users of wake_up_pollfree() will RCU-delay the actual free. If * we enter rcu_read_lock() and see that the pointer to the queue is * non-NULL, we can then lock it without the memory being freed out from * under us, then check whether the request is still on the queue. 
* * Keep holding rcu_read_lock() as long as we hold the queue lock, in * case the caller deletes the entry from the queue, leaving it empty. * In that case, only RCU prevents the queue memory from being freed. */ rcu_read_lock(); head = smp_load_acquire(&req->head); if (head) { spin_lock(&head->lock); if (!list_empty(&req->wait.entry)) return true; spin_unlock(&head->lock); } rcu_read_unlock(); return false; } static void poll_iocb_unlock_wq(struct poll_iocb *req) { spin_unlock(&req->head->lock); rcu_read_unlock(); } static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); struct poll_table_struct pt = { ._key = req->events }; struct kioctx *ctx = iocb->ki_ctx; __poll_t mask = 0; if (!READ_ONCE(req->cancelled)) mask = vfs_poll(req->file, &pt) & req->events; /* * Note that ->ki_cancel callers also delete iocb from active_reqs after * calling ->ki_cancel. We need the ctx_lock roundtrip here to * synchronize with them. In the cancellation case the list_del_init * itself is not actually needed, but harmless so we keep it in to * avoid further branches in the fast path. */ spin_lock_irq(&ctx->ctx_lock); if (poll_iocb_lock_wq(req)) { if (!mask && !READ_ONCE(req->cancelled)) { /* * The request isn't actually ready to be completed yet. * Reschedule completion if another wakeup came in. */ if (req->work_need_resched) { schedule_work(&req->work); req->work_need_resched = false; } else { req->work_scheduled = false; } poll_iocb_unlock_wq(req); spin_unlock_irq(&ctx->ctx_lock); return; } list_del_init(&req->wait.entry); poll_iocb_unlock_wq(req); } /* else, POLLFREE has freed the waitqueue, so we must complete */ list_del_init(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); spin_unlock_irq(&ctx->ctx_lock); iocb_put(iocb); } /* assumes we are called with irqs disabled */ static int aio_poll_cancel(struct kiocb *iocb) { struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); struct poll_iocb *req = &aiocb->poll; if (poll_iocb_lock_wq(req)) { WRITE_ONCE(req->cancelled, true); if (!req->work_scheduled) { schedule_work(&aiocb->poll.work); req->work_scheduled = true; } poll_iocb_unlock_wq(req); } /* else, the request was force-cancelled by POLLFREE already */ return 0; } static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); __poll_t mask = key_to_poll(key); unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & req->events)) return 0; /* * Complete the request inline if possible. This requires that three * conditions be met: * 1. An event mask must have been passed. If a plain wakeup was done * instead, then mask == 0 and we have to call vfs_poll() to get * the events, so inline completion isn't possible. * 2. The completion work must not have already been scheduled. * 3. ctx_lock must not be busy. We have to use trylock because we * already hold the waitqueue lock, so this inverts the normal * locking order. Use irqsave/irqrestore because not all * filesystems (e.g. fuse) call this function with IRQs disabled, * yet IRQs have to be disabled before ctx_lock is obtained. 
*/ if (mask && !req->work_scheduled && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { struct kioctx *ctx = iocb->ki_ctx; list_del_init(&req->wait.entry); list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); if (iocb->ki_eventfd && !eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); schedule_work(&req->work); } spin_unlock_irqrestore(&ctx->ctx_lock, flags); if (iocb) iocb_put(iocb); } else { /* * Schedule the completion work if needed. If it was already * scheduled, record that another wakeup came in. * * Don't remove the request from the waitqueue here, as it might * not actually be complete yet (we won't know until vfs_poll() * is called), and we must not miss any wakeups. POLLFREE is an * exception to this; see below. */ if (req->work_scheduled) { req->work_need_resched = true; } else { schedule_work(&req->work); req->work_scheduled = true; } /* * If the waitqueue is being freed early but we can't complete * the request inline, we have to tear down the request as best * we can. That means immediately removing the request from its * waitqueue and preventing all further accesses to the * waitqueue via the request. We also need to schedule the * completion work (done above). Also mark the request as * cancelled, to potentially skip an unneeded call to ->poll(). */ if (mask & POLLFREE) { WRITE_ONCE(req->cancelled, true); list_del_init(&req->wait.entry); /* * Careful: this *must* be the last step, since as soon * as req->head is NULL'ed out, the request can be * completed and freed, since aio_poll_complete_work() * will no longer need to take the waitqueue lock. */ smp_store_release(&req->head, NULL); } } return 1; } struct aio_poll_table { struct poll_table_struct pt; struct aio_kiocb *iocb; bool queued; int error; }; static void aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt); /* multiple wait queues per file are not supported */ if (unlikely(pt->queued)) { pt->error = -EINVAL; return; } pt->queued = true; pt->error = 0; pt->iocb->poll.head = head; add_wait_queue(head, &pt->iocb->poll.wait); } static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; struct aio_poll_table apt; bool cancel = false; __poll_t mask; /* reject any unknown events outside the normal event mask. */ if ((u16)iocb->aio_buf != iocb->aio_buf) return -EINVAL; /* reject fields that are not defined for poll */ if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) return -EINVAL; INIT_WORK(&req->work, aio_poll_complete_work); req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; req->cancelled = false; req->work_scheduled = false; req->work_need_resched = false; apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; apt.iocb = aiocb; apt.queued = false; apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); mask = vfs_poll(req->file, &apt.pt) & req->events; spin_lock_irq(&ctx->ctx_lock); if (likely(apt.queued)) { bool on_queue = poll_iocb_lock_wq(req); if (!on_queue || req->work_scheduled) { /* * aio_poll_wake() already either scheduled the async * completion work, or completed the request inline. 
*/ if (apt.error) /* unsupported case: multiple queues */ cancel = true; apt.error = 0; mask = 0; } if (mask || apt.error) { /* Steal to complete synchronously. */ list_del_init(&req->wait.entry); } else if (cancel) { /* Cancel if possible (may be too late though). */ WRITE_ONCE(req->cancelled, true); } else if (on_queue) { /* * Actually waiting for an event, so add the request to * active_reqs so that it can be cancelled if needed. */ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; } if (on_queue) poll_iocb_unlock_wq(req); } if (mask) { /* no async, we'd stolen it */ aiocb->ki_res.res = mangle_poll(mask); apt.error = 0; } spin_unlock_irq(&ctx->ctx_lock); if (mask) iocb_put(aiocb); return apt.error; } static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, struct iocb __user *user_iocb, struct aio_kiocb *req, bool compat) { req->ki_filp = fget(iocb->aio_fildes); if (unlikely(!req->ki_filp)) return -EBADF; if (iocb->aio_flags & IOCB_FLAG_RESFD) { struct eventfd_ctx *eventfd; /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ eventfd = eventfd_ctx_fdget(iocb->aio_resfd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); req->ki_eventfd = eventfd; } if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) { pr_debug("EFAULT: aio_key\n"); return -EFAULT; } req->ki_res.obj = (u64)(unsigned long)user_iocb; req->ki_res.data = iocb->aio_data; req->ki_res.res = 0; req->ki_res.res2 = 0; switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: return aio_read(&req->rw, iocb, false, compat); case IOCB_CMD_PWRITE: return aio_write(&req->rw, iocb, false, compat); case IOCB_CMD_PREADV: return aio_read(&req->rw, iocb, true, compat); case IOCB_CMD_PWRITEV: return aio_write(&req->rw, iocb, true, compat); case IOCB_CMD_FSYNC: return aio_fsync(&req->fsync, iocb, false); case IOCB_CMD_FDSYNC: return aio_fsync(&req->fsync, iocb, true); case IOCB_CMD_POLL: return aio_poll(req, iocb); default: pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); return -EINVAL; } } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { struct aio_kiocb *req; struct iocb iocb; int err; if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) return -EFAULT; /* enforce forwards compatibility on users */ if (unlikely(iocb.aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( (iocb.aio_buf != (unsigned long)iocb.aio_buf) || (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || ((ssize_t)iocb.aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; } req = aio_get_req(ctx); if (unlikely(!req)) return -EAGAIN; err = __io_submit_one(ctx, &iocb, user_iocb, req, compat); /* Done with the synchronous reference */ iocb_put(req); /* * If err is 0, we'd either done aio_complete() ourselves or have * arranged for that to be done asynchronously. Anything non-zero * means that we need to destroy req ourselves. */ if (unlikely(err)) { iocb_destroy(req); put_reqs_available(ctx, 1); } return err; } /* sys_io_submit: * Queue the nr iocbs pointed to by iocbpp for processing. Returns * the number of iocbs queued. 
May return -EINVAL if the aio_context * specified by ctx_id is invalid, if nr is < 0, if the iocb at * *iocbpp[0] is not properly initialized, if the operation specified * is invalid for the file descriptor in the iocb. May fail with * -EFAULT if any of the data structures point to invalid data. May * fail with -EBADF if the file descriptor specified in the first * iocb is invalid. May fail with -EAGAIN if insufficient resources * are available to queue any iocbs. Will return 0 if nr is 0. Will * fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, struct iocb __user * __user *, iocbpp) { struct kioctx *ctx; long ret = 0; int i = 0; struct blk_plug plug; if (unlikely(nr < 0)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } if (nr > ctx->nr_events) nr = ctx->nr_events; if (nr > AIO_PLUG_THRESHOLD) blk_start_plug(&plug); for (i = 0; i < nr; i++) { struct iocb __user *user_iocb; if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } ret = io_submit_one(ctx, user_iocb, false); if (ret) break; } if (nr > AIO_PLUG_THRESHOLD) blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, int, nr, compat_uptr_t __user *, iocbpp) { struct kioctx *ctx; long ret = 0; int i = 0; struct blk_plug plug; if (unlikely(nr < 0)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } if (nr > ctx->nr_events) nr = ctx->nr_events; if (nr > AIO_PLUG_THRESHOLD) blk_start_plug(&plug); for (i = 0; i < nr; i++) { compat_uptr_t user_iocb; if (unlikely(get_user(user_iocb, iocbpp + i))) { ret = -EFAULT; break; } ret = io_submit_one(ctx, compat_ptr(user_iocb), true); if (ret) break; } if (nr > AIO_PLUG_THRESHOLD) blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; } #endif /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is * copied into the memory pointed to by result without being placed * into the completion queue and 0 is returned. May fail with * -EFAULT if any of the data structures pointed to are invalid. * May fail with -EINVAL if aio_context specified by ctx_id is * invalid. May fail with -EAGAIN if the iocb specified was not * cancelled. Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { struct kioctx *ctx; struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; u64 obj = (u64)(unsigned long)iocb; if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; if (unlikely(key != KIOCB_KEY)) return -EINVAL; ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) return -EINVAL; spin_lock_irq(&ctx->ctx_lock); list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); list_del_init(&kiocb->ki_list); break; } } spin_unlock_irq(&ctx->ctx_lock); if (!ret) { /* * The result argument is no longer used - the io_event is * always delivered via the ring buffer. -EINPROGRESS indicates * cancellation is progress: */ ret = -EINPROGRESS; } percpu_ref_put(&ctx->users); return ret; } static long do_io_getevents(aio_context_t ctx_id, long min_nr, long nr, struct io_event __user *events, struct timespec64 *ts) { ktime_t until = ts ? 
timespec64_to_ktime(*ts) : KTIME_MAX; struct kioctx *ioctx = lookup_ioctx(ctx_id); long ret = -EINVAL; if (likely(ioctx)) { if (likely(min_nr <= nr && min_nr >= 0)) ret = read_events(ioctx, min_nr, nr, events, until); percpu_ref_put(&ioctx->users); } return ret; } /* io_getevents: * Attempts to read at least min_nr events and up to nr events from * the completion queue for the aio_context specified by ctx_id. If * it succeeds, the number of read events is returned. May fail with * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is * out of range, if timeout is out of range. May fail with -EFAULT * if any of the memory specified is invalid. May return 0 or * < min_nr if the timeout specified by timeout has elapsed * before sufficient events are available, where timeout == NULL * specifies an infinite timeout. Note that the timeout pointed to by * timeout is relative. Will fail with -ENOSYS if not implemented. */ #ifdef CONFIG_64BIT SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout) { struct timespec64 ts; int ret; if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); if (!ret && signal_pending(current)) ret = -EINTR; return ret; } #endif struct __aio_sigset { const sigset_t __user *sigmask; size_t sigsetsize; }; SYSCALL_DEFINE6(io_pgetevents, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; struct timespec64 ts; bool interrupted; int ret; if (timeout && unlikely(get_timespec64(&ts, timeout))) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(io_pgetevents_time32, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; struct timespec64 ts; bool interrupted; int ret; if (timeout && unlikely(get_old_timespec32(&ts, timeout))) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id, __s32, min_nr, __s32, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout) { struct timespec64 t; int ret; if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&t : NULL); if (!ret && signal_pending(current)) ret = -EINTR; return ret; } #endif #ifdef CONFIG_COMPAT struct __compat_aio_sigset { compat_uptr_t sigmask; compat_size_t sigsetsize; }; #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(io_pgetevents, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, struct old_timespec32 __user *, timeout, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; if (timeout && get_old_timespec32(&t, timeout)) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, struct io_event __user *, events, struct __kernel_timespec __user *, timeout, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { 0, }; struct timespec64 t; bool interrupted; int ret; if (timeout && get_timespec64(&t, timeout)) return -EFAULT; if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); interrupted = signal_pending(current); restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; return ret; } #endif
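/*
 * The io_setup/io_submit/io_getevents/io_cancel/io_destroy doc comments above
 * describe the userspace-facing AIO syscall API.  What follows is a minimal
 * userspace sketch of that call sequence, shown for illustration only: it
 * assumes raw syscall(2) invocation (glibc does not wrap these syscalls) and
 * uses a hypothetical "data.bin" file and an arbitrary 4 KiB read; it is not
 * part of fs/aio.c.  Other opcodes (IOCB_CMD_PWRITEV, IOCB_CMD_FSYNC,
 * IOCB_CMD_POLL) are submitted the same way, only aio_lio_opcode and the
 * associated iocb fields change.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

int main(void)
{
	aio_context_t ctx = 0;		/* must be zero before io_setup(), see above */
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;
	char buf[4096];
	int fd = open("data.bin", O_RDONLY);

	if (fd < 0 || syscall(SYS_io_setup, 128, &ctx) < 0)
		return 1;

	/* One asynchronous read of 4 KiB from offset 0. */
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_buf = (unsigned long)buf;	/* __u64 in the ABI */
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;

	if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
		return 1;

	/* Block until the single completion arrives (NULL timeout = forever). */
	if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
		printf("read returned %lld bytes\n", (long long)ev.res);

	syscall(SYS_io_destroy, ctx);
	close(fd);
	return 0;
}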
// SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (c) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) * * Error mapping routines from Samba libsmb/errormap.c * Copyright (C) Andrew Tridgell 2001 */ #include <linux/net.h> #include <linux/string.h> #include <linux/in.h> #include <linux/ctype.h> #include <linux/fs.h> #include <asm/div64.h> #include <asm/byteorder.h> #include <linux/inet.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "smberr.h" #include "cifs_debug.h" #include "nterr.h" struct smb_to_posix_error { __u16 smb_err; int posix_code; }; static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRbadfunc, -EINVAL}, {ERRbadfile, -ENOENT}, {ERRbadpath, -ENOTDIR}, {ERRnofids, -EMFILE}, {ERRnoaccess, -EACCES}, {ERRbadfid, -EBADF}, {ERRbadmcb, -EIO}, {ERRnomem, -EREMOTEIO}, {ERRbadmem, -EFAULT}, {ERRbadenv, -EFAULT}, {ERRbadformat, -EINVAL}, {ERRbadaccess, -EACCES}, {ERRbaddata, -EIO}, {ERRbaddrive, -ENXIO}, {ERRremcd, -EACCES}, {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, {ERRwriteprot, -EROFS}, {ERRbadshare, -EBUSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, {ERRnosuchshare, -ENXIO}, {ERRfilexists, -EEXIST}, {ERRinvparm, -EINVAL}, {ERRdiskfull, -ENOSPC}, {ERRinvname, -ENOENT}, {ERRinvlevel, -EOPNOTSUPP}, {ERRdirnotempty, -ENOTEMPTY}, {ERRnotlocked, -ENOLCK}, {ERRcancelviolation, -ENOLCK}, {ERRalreadyexists, -EEXIST}, {ERRmoredata, -EOVERFLOW}, {ERReasnotsupported, -EOPNOTSUPP}, {ErrQuota, -EDQUOT}, {ErrNotALink, -ENOLINK}, {ERRnetlogonNotStarted, -ENOPROTOOPT}, {ERRsymlink, -EOPNOTSUPP}, {ErrTooManyLinks, -EMLINK}, {0, 0} }; static const struct smb_to_posix_error mapping_table_ERRSRV[] = { {ERRerror, -EIO}, {ERRbadpw, -EACCES}, /* was EPERM */ {ERRbadtype, -EREMOTE}, {ERRaccess, -EACCES}, {ERRinvtid, -ENXIO}, {ERRinvnetname, -ENXIO}, {ERRinvdevice, -ENXIO}, {ERRqfull, -ENOSPC}, {ERRqtoobig, -ENOSPC}, {ERRqeof, -EIO}, {ERRinvpfid, -EBADF}, {ERRsmbcmd, -EBADRQC}, {ERRsrverror, -EIO}, {ERRbadBID, -EIO}, {ERRfilespecs, -EINVAL}, {ERRbadLink, -EIO}, {ERRbadpermits, -EINVAL}, {ERRbadPID, -ESRCH}, {ERRsetattrmode, -EINVAL}, {ERRpaused, -EHOSTDOWN}, {ERRmsgoff, -EHOSTDOWN}, {ERRnoroom, -ENOSPC}, {ERRrmuns, -EUSERS}, {ERRtimeout, -ETIME}, {ERRnoresource, -EREMOTEIO}, {ERRtoomanyuids, -EUSERS}, {ERRbaduid, -EACCES}, {ERRusempx, -EIO}, {ERRusestd, -EIO}, {ERR_NOTIFY_ENUM_DIR, -ENOBUFS}, {ERRnoSuchUser, -EACCES}, /* {ERRaccountexpired, -EACCES}, {ERRbadclient, -EACCES}, {ERRbadLogonTime, -EACCES}, {ERRpasswordExpired, -EACCES},*/ {ERRaccountexpired, -EKEYEXPIRED}, {ERRbadclient, -EACCES}, {ERRbadLogonTime, -EACCES}, {ERRpasswordExpired, -EKEYEXPIRED}, {ERRnosupport, -EINVAL}, {0, 0} }; /* * Convert a string containing text IPv4 or IPv6 address to binary form. * * Returns 0 on failure.
*/ static int cifs_inet_pton(const int address_family, const char *cp, int len, void *dst) { int ret = 0; /* calculate length by finding first slash or NULL */ if (address_family == AF_INET) ret = in4_pton(cp, len, dst, '\\', NULL); else if (address_family == AF_INET6) ret = in6_pton(cp, len, dst , '\\', NULL); cifs_dbg(NOISY, "address conversion returned %d for %*.*s\n", ret, len, len, cp); if (ret > 0) ret = 1; return ret; } /* * Try to convert a string to an IPv4 address and then attempt to convert * it to an IPv6 address if that fails. Set the family field if either * succeeds. If it's an IPv6 address and it has a '%' sign in it, try to * treat the part following it as a numeric sin6_scope_id. * * Returns 0 on failure. */ int cifs_convert_address(struct sockaddr *dst, const char *src, int len) { int rc, alen, slen; const char *pct; char scope_id[13]; struct sockaddr_in *s4 = (struct sockaddr_in *) dst; struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; /* IPv4 address */ if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) { s4->sin_family = AF_INET; return 1; } /* attempt to exclude the scope ID from the address part */ pct = memchr(src, '%', len); alen = pct ? pct - src : len; rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr); if (!rc) return rc; s6->sin6_family = AF_INET6; if (pct) { /* grab the scope ID */ slen = len - (alen + 1); if (slen <= 0 || slen > 12) return 0; memcpy(scope_id, pct + 1, slen); scope_id[slen] = '\0'; rc = kstrtouint(scope_id, 0, &s6->sin6_scope_id); rc = (rc == 0) ? 1 : 0; } return rc; } void cifs_set_port(struct sockaddr *addr, const unsigned short int port) { switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *)addr)->sin_port = htons(port); break; case AF_INET6: ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); break; } } /***************************************************************************** convert a NT status code to a dos class/code *****************************************************************************/ /* NT status -> dos error map */ static const struct { __u8 dos_class; __u16 dos_code; __u32 ntstatus; } ntstatus_to_dos_map[] = { { ERRDOS, ERRgeneral, NT_STATUS_UNSUCCESSFUL}, { ERRDOS, ERRbadfunc, NT_STATUS_NOT_IMPLEMENTED}, { ERRDOS, ERRinvlevel, NT_STATUS_INVALID_INFO_CLASS}, { ERRDOS, 24, NT_STATUS_INFO_LENGTH_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_ACCESS_VIOLATION}, { ERRHRD, ERRgeneral, NT_STATUS_IN_PAGE_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA}, { ERRDOS, ERRbadfid, NT_STATUS_INVALID_HANDLE}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_INITIAL_STACK}, { ERRDOS, 193, NT_STATUS_BAD_INITIAL_PC}, { ERRDOS, 87, NT_STATUS_INVALID_CID}, { ERRHRD, ERRgeneral, NT_STATUS_TIMER_NOT_CANCELED}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER}, { ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_DEVICE}, { ERRDOS, ERRbadfile, NT_STATUS_NO_SUCH_FILE}, { ERRDOS, ERRbadfunc, NT_STATUS_INVALID_DEVICE_REQUEST}, { ERRDOS, 38, NT_STATUS_END_OF_FILE}, { ERRDOS, 34, NT_STATUS_WRONG_VOLUME}, { ERRDOS, 21, NT_STATUS_NO_MEDIA_IN_DEVICE}, { ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_MEDIA}, { ERRDOS, 27, NT_STATUS_NONEXISTENT_SECTOR}, /* { This NT error code was 'sqashed' from NT_STATUS_MORE_PROCESSING_REQUIRED to NT_STATUS_OK during the session setup } */ { ERRDOS, ERRnomem, NT_STATUS_NO_MEMORY}, { ERRDOS, 487, NT_STATUS_CONFLICTING_ADDRESSES}, { ERRDOS, 487, NT_STATUS_NOT_MAPPED_VIEW}, { ERRDOS, 87, NT_STATUS_UNABLE_TO_FREE_VM}, { ERRDOS, 87, NT_STATUS_UNABLE_TO_DELETE_SECTION}, { ERRDOS, 2142, 
NT_STATUS_INVALID_SYSTEM_SERVICE}, { ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_INSTRUCTION}, { ERRDOS, ERRnoaccess, NT_STATUS_INVALID_LOCK_SEQUENCE}, { ERRDOS, ERRnoaccess, NT_STATUS_INVALID_VIEW_SIZE}, { ERRDOS, 193, NT_STATUS_INVALID_FILE_FOR_SECTION}, { ERRDOS, ERRnoaccess, NT_STATUS_ALREADY_COMMITTED}, /* { This NT error code was 'sqashed' from NT_STATUS_ACCESS_DENIED to NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE during the session setup } */ { ERRDOS, ERRnoaccess, NT_STATUS_ACCESS_DENIED}, { ERRDOS, 111, NT_STATUS_BUFFER_TOO_SMALL}, { ERRDOS, ERRbadfid, NT_STATUS_OBJECT_TYPE_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_NONCONTINUABLE_EXCEPTION}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_DISPOSITION}, { ERRHRD, ERRgeneral, NT_STATUS_UNWIND}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_STACK}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_UNWIND_TARGET}, { ERRDOS, 158, NT_STATUS_NOT_LOCKED}, { ERRHRD, ERRgeneral, NT_STATUS_PARITY_ERROR}, { ERRDOS, 487, NT_STATUS_UNABLE_TO_DECOMMIT_VM}, { ERRDOS, 487, NT_STATUS_NOT_COMMITTED}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_PORT_ATTRIBUTES}, { ERRHRD, ERRgeneral, NT_STATUS_PORT_MESSAGE_TOO_LONG}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_MIX}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_QUOTA_LOWER}, { ERRHRD, ERRgeneral, NT_STATUS_DISK_CORRUPT_ERROR}, { /* mapping changed since shell does lookup on * expects FileNotFound */ ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_INVALID}, { ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_NOT_FOUND}, { ERRDOS, ERRalreadyexists, NT_STATUS_OBJECT_NAME_COLLISION}, { ERRHRD, ERRgeneral, NT_STATUS_HANDLE_NOT_WAITABLE}, { ERRDOS, ERRbadfid, NT_STATUS_PORT_DISCONNECTED}, { ERRHRD, ERRgeneral, NT_STATUS_DEVICE_ALREADY_ATTACHED}, { ERRDOS, 161, NT_STATUS_OBJECT_PATH_INVALID}, { ERRDOS, ERRbadpath, NT_STATUS_OBJECT_PATH_NOT_FOUND}, { ERRDOS, 161, NT_STATUS_OBJECT_PATH_SYNTAX_BAD}, { ERRHRD, ERRgeneral, NT_STATUS_DATA_OVERRUN}, { ERRHRD, ERRgeneral, NT_STATUS_DATA_LATE_ERROR}, { ERRDOS, 23, NT_STATUS_DATA_ERROR}, { ERRDOS, 23, NT_STATUS_CRC_ERROR}, { ERRDOS, ERRnomem, NT_STATUS_SECTION_TOO_BIG}, { ERRDOS, ERRnoaccess, NT_STATUS_PORT_CONNECTION_REFUSED}, { ERRDOS, ERRbadfid, NT_STATUS_INVALID_PORT_HANDLE}, { ERRDOS, ERRbadshare, NT_STATUS_SHARING_VIOLATION}, { ERRHRD, ERRgeneral, NT_STATUS_QUOTA_EXCEEDED}, { ERRDOS, 87, NT_STATUS_INVALID_PAGE_PROTECTION}, { ERRDOS, 288, NT_STATUS_MUTANT_NOT_OWNED}, { ERRDOS, 298, NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED}, { ERRDOS, 87, NT_STATUS_PORT_ALREADY_SET}, { ERRDOS, 87, NT_STATUS_SECTION_NOT_IMAGE}, { ERRDOS, 156, NT_STATUS_SUSPEND_COUNT_EXCEEDED}, { ERRDOS, ERRnoaccess, NT_STATUS_THREAD_IS_TERMINATING}, { ERRDOS, 87, NT_STATUS_BAD_WORKING_SET_LIMIT}, { ERRDOS, 87, NT_STATUS_INCOMPATIBLE_FILE_MAP}, { ERRDOS, 87, NT_STATUS_SECTION_PROTECTION}, { ERRDOS, ERReasnotsupported, NT_STATUS_EAS_NOT_SUPPORTED}, { ERRDOS, 255, NT_STATUS_EA_TOO_LARGE}, { ERRHRD, ERRgeneral, NT_STATUS_NONEXISTENT_EA_ENTRY}, { ERRHRD, ERRgeneral, NT_STATUS_NO_EAS_ON_FILE}, { ERRHRD, ERRgeneral, NT_STATUS_EA_CORRUPT_ERROR}, { ERRDOS, ERRlock, NT_STATUS_FILE_LOCK_CONFLICT}, { ERRDOS, ERRlock, NT_STATUS_LOCK_NOT_GRANTED}, { ERRDOS, ERRbadfile, NT_STATUS_DELETE_PENDING}, { ERRDOS, ERRunsup, NT_STATUS_CTL_FILE_NOT_SUPPORTED}, { ERRHRD, ERRgeneral, NT_STATUS_UNKNOWN_REVISION}, { ERRHRD, ERRgeneral, NT_STATUS_REVISION_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_OWNER}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_PRIMARY_GROUP}, { ERRHRD, ERRgeneral, NT_STATUS_NO_IMPERSONATION_TOKEN}, { ERRHRD, ERRgeneral, NT_STATUS_CANT_DISABLE_MANDATORY}, { ERRDOS, 2215, 
NT_STATUS_NO_LOGON_SERVERS}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_LOGON_SESSION}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PRIVILEGE}, { ERRDOS, ERRnoaccess, NT_STATUS_PRIVILEGE_NOT_HELD}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACCOUNT_NAME}, { ERRHRD, ERRgeneral, NT_STATUS_USER_EXISTS}, /* { This NT error code was 'sqashed' from NT_STATUS_NO_SUCH_USER to NT_STATUS_LOGON_FAILURE during the session setup } */ { ERRDOS, ERRnoaccess, NT_STATUS_NO_SUCH_USER}, { /* could map to 2238 */ ERRHRD, ERRgeneral, NT_STATUS_GROUP_EXISTS}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_GROUP}, { ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_GROUP}, { ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_GROUP}, { ERRHRD, ERRgeneral, NT_STATUS_LAST_ADMIN}, /* { This NT error code was 'sqashed' from NT_STATUS_WRONG_PASSWORD to NT_STATUS_LOGON_FAILURE during the session setup } */ { ERRSRV, ERRbadpw, NT_STATUS_WRONG_PASSWORD}, { ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_PASSWORD}, { ERRHRD, ERRgeneral, NT_STATUS_PASSWORD_RESTRICTION}, { ERRDOS, ERRnoaccess, NT_STATUS_LOGON_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_ACCOUNT_RESTRICTION}, { ERRSRV, ERRbadLogonTime, NT_STATUS_INVALID_LOGON_HOURS}, { ERRSRV, ERRbadclient, NT_STATUS_INVALID_WORKSTATION}, { ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_EXPIRED}, { ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_DISABLED}, { ERRHRD, ERRgeneral, NT_STATUS_NONE_MAPPED}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_LUIDS_REQUESTED}, { ERRHRD, ERRgeneral, NT_STATUS_LUIDS_EXHAUSTED}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_SUB_AUTHORITY}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACL}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_SID}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_SECURITY_DESCR}, { ERRDOS, 127, NT_STATUS_PROCEDURE_NOT_FOUND}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_FORMAT}, { ERRHRD, ERRgeneral, NT_STATUS_NO_TOKEN}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_INHERITANCE_ACL}, { ERRDOS, 158, NT_STATUS_RANGE_NOT_LOCKED}, { ERRDOS, 112, NT_STATUS_DISK_FULL}, { ERRHRD, ERRgeneral, NT_STATUS_SERVER_DISABLED}, { ERRHRD, ERRgeneral, NT_STATUS_SERVER_NOT_DISABLED}, { ERRDOS, 68, NT_STATUS_TOO_MANY_GUIDS_REQUESTED}, { ERRDOS, 259, NT_STATUS_GUIDS_EXHAUSTED}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ID_AUTHORITY}, { ERRDOS, 259, NT_STATUS_AGENTS_EXHAUSTED}, { ERRDOS, 154, NT_STATUS_INVALID_VOLUME_LABEL}, { ERRDOS, 14, NT_STATUS_SECTION_NOT_EXTENDED}, { ERRDOS, 487, NT_STATUS_NOT_MAPPED_DATA}, { ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_DATA_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_TYPE_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_NAME_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_ARRAY_BOUNDS_EXCEEDED}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DENORMAL_OPERAND}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_DIVIDE_BY_ZERO}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INEXACT_RESULT}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_INVALID_OPERATION}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_OVERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_STACK_CHECK}, { ERRHRD, ERRgeneral, NT_STATUS_FLOAT_UNDERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_INTEGER_DIVIDE_BY_ZERO}, { ERRDOS, 534, NT_STATUS_INTEGER_OVERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_PRIVILEGED_INSTRUCTION}, { ERRDOS, ERRnomem, NT_STATUS_TOO_MANY_PAGING_FILES}, { ERRHRD, ERRgeneral, NT_STATUS_FILE_INVALID}, { ERRHRD, ERRgeneral, NT_STATUS_ALLOTTED_SPACE_EXCEEDED}, /* { This NT error code was 'sqashed' from NT_STATUS_INSUFFICIENT_RESOURCES to NT_STATUS_INSUFF_SERVER_RESOURCES during the session setup } */ { ERRDOS, ERRnoresource, NT_STATUS_INSUFFICIENT_RESOURCES}, { ERRDOS, 
ERRbadpath, NT_STATUS_DFS_EXIT_PATH_FOUND}, { ERRDOS, 23, NT_STATUS_DEVICE_DATA_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_CONNECTED}, { ERRDOS, 21, NT_STATUS_DEVICE_POWER_FAILURE}, { ERRDOS, 487, NT_STATUS_FREE_VM_NOT_AT_BASE}, { ERRDOS, 487, NT_STATUS_MEMORY_NOT_ALLOCATED}, { ERRHRD, ERRgeneral, NT_STATUS_WORKING_SET_QUOTA}, { ERRDOS, 19, NT_STATUS_MEDIA_WRITE_PROTECTED}, { ERRDOS, 21, NT_STATUS_DEVICE_NOT_READY}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_GROUP_ATTRIBUTES}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_IMPERSONATION_LEVEL}, { ERRHRD, ERRgeneral, NT_STATUS_CANT_OPEN_ANONYMOUS}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_VALIDATION_CLASS}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_TOKEN_TYPE}, { ERRDOS, 87, NT_STATUS_BAD_MASTER_BOOT_RECORD}, { ERRHRD, ERRgeneral, NT_STATUS_INSTRUCTION_MISALIGNMENT}, { ERRDOS, ERRpipebusy, NT_STATUS_INSTANCE_NOT_AVAILABLE}, { ERRDOS, ERRpipebusy, NT_STATUS_PIPE_NOT_AVAILABLE}, { ERRDOS, ERRbadpipe, NT_STATUS_INVALID_PIPE_STATE}, { ERRDOS, ERRpipebusy, NT_STATUS_PIPE_BUSY}, { ERRDOS, ERRbadfunc, NT_STATUS_ILLEGAL_FUNCTION}, { ERRDOS, ERRnotconnected, NT_STATUS_PIPE_DISCONNECTED}, { ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_CLOSING}, { ERRHRD, ERRgeneral, NT_STATUS_PIPE_CONNECTED}, { ERRHRD, ERRgeneral, NT_STATUS_PIPE_LISTENING}, { ERRDOS, ERRbadpipe, NT_STATUS_INVALID_READ_MODE}, { ERRDOS, 121, NT_STATUS_IO_TIMEOUT}, { ERRDOS, 38, NT_STATUS_FILE_FORCED_CLOSED}, { ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STARTED}, { ERRHRD, ERRgeneral, NT_STATUS_PROFILING_NOT_STOPPED}, { ERRHRD, ERRgeneral, NT_STATUS_COULD_NOT_INTERPRET}, { ERRDOS, ERRnoaccess, NT_STATUS_FILE_IS_A_DIRECTORY}, { ERRDOS, ERRunsup, NT_STATUS_NOT_SUPPORTED}, { ERRDOS, 51, NT_STATUS_REMOTE_NOT_LISTENING}, { ERRDOS, 52, NT_STATUS_DUPLICATE_NAME}, { ERRDOS, 53, NT_STATUS_BAD_NETWORK_PATH}, { ERRDOS, 54, NT_STATUS_NETWORK_BUSY}, { ERRDOS, 55, NT_STATUS_DEVICE_DOES_NOT_EXIST}, { ERRDOS, 56, NT_STATUS_TOO_MANY_COMMANDS}, { ERRDOS, 57, NT_STATUS_ADAPTER_HARDWARE_ERROR}, { ERRDOS, 58, NT_STATUS_INVALID_NETWORK_RESPONSE}, { ERRDOS, 59, NT_STATUS_UNEXPECTED_NETWORK_ERROR}, { ERRDOS, 60, NT_STATUS_BAD_REMOTE_ADAPTER}, { ERRDOS, 61, NT_STATUS_PRINT_QUEUE_FULL}, { ERRDOS, 62, NT_STATUS_NO_SPOOL_SPACE}, { ERRDOS, 63, NT_STATUS_PRINT_CANCELLED}, { ERRDOS, 64, NT_STATUS_NETWORK_NAME_DELETED}, { ERRDOS, 65, NT_STATUS_NETWORK_ACCESS_DENIED}, { ERRDOS, 66, NT_STATUS_BAD_DEVICE_TYPE}, { ERRDOS, ERRnosuchshare, NT_STATUS_BAD_NETWORK_NAME}, { ERRDOS, 68, NT_STATUS_TOO_MANY_NAMES}, { ERRDOS, 69, NT_STATUS_TOO_MANY_SESSIONS}, { ERRDOS, 70, NT_STATUS_SHARING_PAUSED}, { ERRDOS, 71, NT_STATUS_REQUEST_NOT_ACCEPTED}, { ERRDOS, 72, NT_STATUS_REDIRECTOR_PAUSED}, { ERRDOS, 88, NT_STATUS_NET_WRITE_FAULT}, { ERRHRD, ERRgeneral, NT_STATUS_PROFILING_AT_LIMIT}, { ERRDOS, ERRdiffdevice, NT_STATUS_NOT_SAME_DEVICE}, { ERRDOS, ERRnoaccess, NT_STATUS_FILE_RENAMED}, { ERRDOS, 240, NT_STATUS_VIRTUAL_CIRCUIT_CLOSED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SECURITY_ON_OBJECT}, { ERRHRD, ERRgeneral, NT_STATUS_CANT_WAIT}, { ERRDOS, ERRpipeclosing, NT_STATUS_PIPE_EMPTY}, { ERRHRD, ERRgeneral, NT_STATUS_CANT_ACCESS_DOMAIN_INFO}, { ERRHRD, ERRgeneral, NT_STATUS_CANT_TERMINATE_SELF}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_SERVER_STATE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_STATE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_DOMAIN_ROLE}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_DOMAIN}, { ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_EXISTS}, { ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_LIMIT_EXCEEDED}, { ERRDOS, 300, NT_STATUS_OPLOCK_NOT_GRANTED}, { ERRDOS, 
301, NT_STATUS_INVALID_OPLOCK_PROTOCOL}, { ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_CORRUPTION}, { ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_GENERIC_NOT_MAPPED}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_DESCRIPTOR_FORMAT}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_USER_BUFFER}, { ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_IO_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_CREATE_ERR}, { ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_MAP_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_UNEXPECTED_MM_EXTEND_ERR}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_LOGON_PROCESS}, { ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_EXISTS}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_1}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_2}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_3}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_4}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_5}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_6}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_7}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_8}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_9}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_10}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_11}, { ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_12}, { ERRDOS, ERRbadpath, NT_STATUS_REDIRECTOR_NOT_STARTED}, { ERRHRD, ERRgeneral, NT_STATUS_REDIRECTOR_STARTED}, { ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PACKAGE}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_FUNCTION_TABLE}, { ERRDOS, 203, 0xc0000100}, { ERRDOS, 145, NT_STATUS_DIRECTORY_NOT_EMPTY}, { ERRHRD, ERRgeneral, NT_STATUS_FILE_CORRUPT_ERROR}, { ERRDOS, 267, NT_STATUS_NOT_A_DIRECTORY}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_LOGON_SESSION_STATE}, { ERRHRD, ERRgeneral, NT_STATUS_LOGON_SESSION_COLLISION}, { ERRDOS, 206, NT_STATUS_NAME_TOO_LONG}, { ERRDOS, 2401, NT_STATUS_FILES_OPEN}, { ERRDOS, 2404, NT_STATUS_CONNECTION_IN_USE}, { ERRHRD, ERRgeneral, NT_STATUS_MESSAGE_NOT_FOUND}, { ERRDOS, ERRnoaccess, NT_STATUS_PROCESS_IS_TERMINATING}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_LOGON_TYPE}, { ERRHRD, ERRgeneral, NT_STATUS_NO_GUID_TRANSLATION}, { ERRHRD, ERRgeneral, NT_STATUS_CANNOT_IMPERSONATE}, { ERRHRD, ERRgeneral, NT_STATUS_IMAGE_ALREADY_LOADED}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_PRESENT}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_NOT_EXIST}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_LID_ALREADY_OWNED}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_NOT_LID_OWNER}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_COMMAND}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_LID}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE}, { ERRHRD, ERRgeneral, NT_STATUS_ABIOS_INVALID_SELECTOR}, { ERRHRD, ERRgeneral, NT_STATUS_NO_LDT}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_SIZE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_OFFSET}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_LDT_DESCRIPTOR}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NE_FORMAT}, { ERRHRD, ERRgeneral, NT_STATUS_RXACT_INVALID_STATE}, { ERRHRD, ERRgeneral, NT_STATUS_RXACT_COMMIT_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_MAPPED_FILE_SIZE_ZERO}, { ERRDOS, ERRnofids, NT_STATUS_TOO_MANY_OPENED_FILES}, { ERRHRD, ERRgeneral, NT_STATUS_CANCELLED}, { ERRDOS, ERRnoaccess, NT_STATUS_CANNOT_DELETE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_COMPUTER_NAME}, { ERRDOS, ERRnoaccess, NT_STATUS_FILE_DELETED}, { ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_ACCOUNT}, { ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_GROUP}, { ERRHRD, ERRgeneral, NT_STATUS_SPECIAL_USER}, { ERRHRD, ERRgeneral, NT_STATUS_MEMBERS_PRIMARY_GROUP}, { ERRDOS, ERRbadfid, 
NT_STATUS_FILE_CLOSED}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_THREADS}, { ERRHRD, ERRgeneral, NT_STATUS_THREAD_NOT_IN_PROCESS}, { ERRHRD, ERRgeneral, NT_STATUS_TOKEN_ALREADY_IN_USE}, { ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_QUOTA_EXCEEDED}, { ERRHRD, ERRgeneral, NT_STATUS_COMMITMENT_LIMIT}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_LE_FORMAT}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_NOT_MZ}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_PROTECT}, { ERRDOS, 193, NT_STATUS_INVALID_IMAGE_WIN_16}, { ERRHRD, ERRgeneral, NT_STATUS_LOGON_SERVER_CONFLICT}, { ERRHRD, ERRgeneral, NT_STATUS_TIME_DIFFERENCE_AT_DC}, { ERRHRD, ERRgeneral, NT_STATUS_SYNCHRONIZATION_REQUIRED}, { ERRDOS, 126, NT_STATUS_DLL_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_OPEN_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_IO_PRIVILEGE_FAILED}, { ERRDOS, 182, NT_STATUS_ORDINAL_NOT_FOUND}, { ERRDOS, 127, NT_STATUS_ENTRYPOINT_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_CONTROL_C_EXIT}, { ERRDOS, 64, NT_STATUS_LOCAL_DISCONNECT}, { ERRDOS, 64, NT_STATUS_REMOTE_DISCONNECT}, { ERRDOS, 51, NT_STATUS_REMOTE_RESOURCES}, { ERRDOS, 59, NT_STATUS_LINK_FAILED}, { ERRDOS, 59, NT_STATUS_LINK_TIMEOUT}, { ERRDOS, 59, NT_STATUS_INVALID_CONNECTION}, { ERRDOS, 59, NT_STATUS_INVALID_ADDRESS}, { ERRHRD, ERRgeneral, NT_STATUS_DLL_INIT_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_MISSING_SYSTEMFILE}, { ERRHRD, ERRgeneral, NT_STATUS_UNHANDLED_EXCEPTION}, { ERRHRD, ERRgeneral, NT_STATUS_APP_INIT_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_PAGEFILE_CREATE_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_PAGEFILE}, { ERRDOS, 124, NT_STATUS_INVALID_LEVEL}, { ERRDOS, 86, NT_STATUS_WRONG_PASSWORD_CORE}, { ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_FLOAT_CONTEXT}, { ERRDOS, 109, NT_STATUS_PIPE_BROKEN}, { ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_CORRUPT}, { ERRHRD, ERRgeneral, NT_STATUS_REGISTRY_IO_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_EVENT_PAIR}, { ERRHRD, ERRgeneral, NT_STATUS_UNRECOGNIZED_VOLUME}, { ERRHRD, ERRgeneral, NT_STATUS_SERIAL_NO_DEVICE_INITED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_ALIAS}, { ERRHRD, ERRgeneral, NT_STATUS_MEMBER_NOT_IN_ALIAS}, { ERRHRD, ERRgeneral, NT_STATUS_MEMBER_IN_ALIAS}, { ERRHRD, ERRgeneral, NT_STATUS_ALIAS_EXISTS}, { ERRHRD, ERRgeneral, NT_STATUS_LOGON_NOT_GRANTED}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SECRETS}, { ERRHRD, ERRgeneral, NT_STATUS_SECRET_TOO_LONG}, { ERRHRD, ERRgeneral, NT_STATUS_INTERNAL_DB_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_FULLSCREEN_MODE}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_CONTEXT_IDS}, { ERRDOS, ERRnoaccess, NT_STATUS_LOGON_TYPE_NOT_GRANTED}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_REGISTRY_FILE}, { ERRHRD, ERRgeneral, NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED}, { ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_FT_MISSING_MEMBER}, { ERRHRD, ERRgeneral, NT_STATUS_ILL_FORMED_SERVICE_ENTRY}, { ERRHRD, ERRgeneral, NT_STATUS_ILLEGAL_CHARACTER}, { ERRHRD, ERRgeneral, NT_STATUS_UNMAPPABLE_CHARACTER}, { ERRHRD, ERRgeneral, NT_STATUS_UNDEFINED_CHARACTER}, { ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_VOLUME}, { ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_WRONG_CYLINDER}, { ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_UNKNOWN_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_FLOPPY_BAD_REGISTERS}, { ERRHRD, ERRgeneral, NT_STATUS_DISK_RECALIBRATE_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_DISK_OPERATION_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_DISK_RESET_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_SHARED_IRQ_BUSY}, { ERRHRD, ERRgeneral, NT_STATUS_FT_ORPHANING}, { 
ERRHRD, ERRgeneral, 0xc000016e}, { ERRHRD, ERRgeneral, 0xc000016f}, { ERRHRD, ERRgeneral, 0xc0000170}, { ERRHRD, ERRgeneral, 0xc0000171}, { ERRHRD, ERRgeneral, NT_STATUS_PARTITION_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_BLOCK_LENGTH}, { ERRHRD, ERRgeneral, NT_STATUS_DEVICE_NOT_PARTITIONED}, { ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_LOCK_MEDIA}, { ERRHRD, ERRgeneral, NT_STATUS_UNABLE_TO_UNLOAD_MEDIA}, { ERRHRD, ERRgeneral, NT_STATUS_EOM_OVERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_NO_MEDIA}, { ERRHRD, ERRgeneral, 0xc0000179}, { ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_MEMBER}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_MEMBER}, { ERRHRD, ERRgeneral, NT_STATUS_KEY_DELETED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_LOG_SPACE}, { ERRHRD, ERRgeneral, NT_STATUS_TOO_MANY_SIDS}, { ERRHRD, ERRgeneral, NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED}, { ERRHRD, ERRgeneral, NT_STATUS_KEY_HAS_CHILDREN}, { ERRHRD, ERRgeneral, NT_STATUS_CHILD_MUST_BE_VOLATILE}, { ERRDOS, 87, NT_STATUS_DEVICE_CONFIGURATION_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_DRIVER_INTERNAL_ERROR}, { ERRDOS, 22, NT_STATUS_INVALID_DEVICE_STATE}, { ERRHRD, ERRgeneral, NT_STATUS_IO_DEVICE_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_DEVICE_PROTOCOL_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_BACKUP_CONTROLLER}, { ERRHRD, ERRgeneral, NT_STATUS_LOG_FILE_FULL}, { ERRDOS, 19, NT_STATUS_TOO_LATE}, { ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_LSA_SECRET}, /* { This NT error code was 'sqashed' from NT_STATUS_NO_TRUST_SAM_ACCOUNT to NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE during the session setup } */ { ERRDOS, ERRnoaccess, NT_STATUS_NO_TRUST_SAM_ACCOUNT}, { ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_DOMAIN_FAILURE}, { ERRDOS, ERRnoaccess, NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CORRUPT}, { ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_CANT_START}, { ERRDOS, ERRnoaccess, NT_STATUS_TRUST_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_MUTANT_LIMIT_EXCEEDED}, { ERRDOS, ERRnetlogonNotStarted, NT_STATUS_NETLOGON_NOT_STARTED}, { ERRSRV, ERRaccountexpired, NT_STATUS_ACCOUNT_EXPIRED}, { ERRHRD, ERRgeneral, NT_STATUS_POSSIBLE_DEADLOCK}, { ERRHRD, ERRgeneral, NT_STATUS_NETWORK_CREDENTIAL_CONFLICT}, { ERRHRD, ERRgeneral, NT_STATUS_REMOTE_SESSION_LIMIT}, { ERRHRD, ERRgeneral, NT_STATUS_EVENTLOG_FILE_CHANGED}, { ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT}, { ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT}, { ERRDOS, ERRnoaccess, NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT}, /* { This NT error code was 'sqashed' from NT_STATUS_DOMAIN_TRUST_INCONSISTENT to NT_STATUS_LOGON_FAILURE during the session setup } */ { ERRDOS, ERRnoaccess, NT_STATUS_DOMAIN_TRUST_INCONSISTENT}, { ERRHRD, ERRgeneral, NT_STATUS_FS_DRIVER_REQUIRED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_USER_SESSION_KEY}, { ERRDOS, 59, NT_STATUS_USER_SESSION_DELETED}, { ERRHRD, ERRgeneral, NT_STATUS_RESOURCE_LANG_NOT_FOUND}, { ERRDOS, ERRnoresource, NT_STATUS_INSUFF_SERVER_RESOURCES}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_BUFFER_SIZE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_COMPONENT}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_ADDRESS_WILDCARD}, { ERRDOS, 68, NT_STATUS_TOO_MANY_ADDRESSES}, { ERRDOS, 52, NT_STATUS_ADDRESS_ALREADY_EXISTS}, { ERRDOS, 64, NT_STATUS_ADDRESS_CLOSED}, { ERRDOS, 64, NT_STATUS_CONNECTION_DISCONNECTED}, { ERRDOS, 64, NT_STATUS_CONNECTION_RESET}, { ERRDOS, 68, NT_STATUS_TOO_MANY_NODES}, { ERRDOS, 59, NT_STATUS_TRANSACTION_ABORTED}, { ERRDOS, 59, NT_STATUS_TRANSACTION_TIMED_OUT}, { ERRDOS, 59, NT_STATUS_TRANSACTION_NO_RELEASE}, { ERRDOS, 
59, NT_STATUS_TRANSACTION_NO_MATCH}, { ERRDOS, 59, NT_STATUS_TRANSACTION_RESPONDED}, { ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_ID}, { ERRDOS, 59, NT_STATUS_TRANSACTION_INVALID_TYPE}, { ERRDOS, ERRunsup, NT_STATUS_NOT_SERVER_SESSION}, { ERRDOS, ERRunsup, NT_STATUS_NOT_CLIENT_SESSION}, { ERRHRD, ERRgeneral, NT_STATUS_CANNOT_LOAD_REGISTRY_FILE}, { ERRHRD, ERRgeneral, NT_STATUS_DEBUG_ATTACH_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_SYSTEM_PROCESS_TERMINATED}, { ERRHRD, ERRgeneral, NT_STATUS_DATA_NOT_ACCEPTED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_BROWSER_SERVERS_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_VDM_HARD_ERROR}, { ERRHRD, ERRgeneral, NT_STATUS_DRIVER_CANCEL_TIMEOUT}, { ERRHRD, ERRgeneral, NT_STATUS_REPLY_MESSAGE_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_MAPPED_ALIGNMENT}, { ERRDOS, 193, NT_STATUS_IMAGE_CHECKSUM_MISMATCH}, { ERRHRD, ERRgeneral, NT_STATUS_LOST_WRITEBEHIND_DATA}, { ERRHRD, ERRgeneral, NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID}, { ERRSRV, ERRpasswordExpired, NT_STATUS_PASSWORD_MUST_CHANGE}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_NOT_TINY_STREAM}, { ERRHRD, ERRgeneral, NT_STATUS_RECOVERY_FAILURE}, { ERRHRD, ERRgeneral, NT_STATUS_STACK_OVERFLOW_READ}, { ERRHRD, ERRgeneral, NT_STATUS_FAIL_CHECK}, { ERRHRD, ERRgeneral, NT_STATUS_DUPLICATE_OBJECTID}, { ERRHRD, ERRgeneral, NT_STATUS_OBJECTID_EXISTS}, { ERRHRD, ERRgeneral, NT_STATUS_CONVERT_TO_LARGE}, { ERRHRD, ERRgeneral, NT_STATUS_RETRY}, { ERRHRD, ERRgeneral, NT_STATUS_FOUND_OUT_OF_SCOPE}, { ERRHRD, ERRgeneral, NT_STATUS_ALLOCATE_BUCKET}, { ERRHRD, ERRgeneral, NT_STATUS_PROPSET_NOT_FOUND}, { ERRHRD, ERRgeneral, NT_STATUS_MARSHALL_OVERFLOW}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_VARIANT}, { ERRHRD, ERRgeneral, NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND}, { ERRDOS, ERRnoaccess, NT_STATUS_ACCOUNT_LOCKED_OUT}, { ERRDOS, ERRbadfid, NT_STATUS_HANDLE_NOT_CLOSABLE}, { ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_REFUSED}, { ERRHRD, ERRgeneral, NT_STATUS_GRACEFUL_DISCONNECT}, { ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_ALREADY_ASSOCIATED}, { ERRHRD, ERRgeneral, NT_STATUS_ADDRESS_NOT_ASSOCIATED}, { ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_INVALID}, { ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ACTIVE}, { ERRHRD, ERRgeneral, NT_STATUS_NETWORK_UNREACHABLE}, { ERRHRD, ERRgeneral, NT_STATUS_HOST_UNREACHABLE}, { ERRHRD, ERRgeneral, NT_STATUS_PROTOCOL_UNREACHABLE}, { ERRHRD, ERRgeneral, NT_STATUS_PORT_UNREACHABLE}, { ERRHRD, ERRgeneral, NT_STATUS_REQUEST_ABORTED}, { ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_ABORTED}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_COMPRESSION_BUFFER}, { ERRHRD, ERRgeneral, NT_STATUS_USER_MAPPED_FILE}, { ERRHRD, ERRgeneral, NT_STATUS_AUDIT_FAILED}, { ERRHRD, ERRgeneral, NT_STATUS_TIMER_RESOLUTION_NOT_SET}, { ERRHRD, ERRgeneral, NT_STATUS_CONNECTION_COUNT_LIMIT}, { ERRHRD, ERRgeneral, NT_STATUS_LOGIN_TIME_RESTRICTION}, { ERRHRD, ERRgeneral, NT_STATUS_LOGIN_WKSTA_RESTRICTION}, { ERRDOS, 193, NT_STATUS_IMAGE_MP_UP_MISMATCH}, { ERRHRD, ERRgeneral, 0xc000024a}, { ERRHRD, ERRgeneral, 0xc000024b}, { ERRHRD, ERRgeneral, 0xc000024c}, { ERRHRD, ERRgeneral, 0xc000024d}, { ERRHRD, ERRgeneral, 0xc000024e}, { ERRHRD, ERRgeneral, 0xc000024f}, { ERRHRD, ERRgeneral, NT_STATUS_INSUFFICIENT_LOGON_INFO}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_DLL_ENTRYPOINT}, { ERRHRD, ERRgeneral, NT_STATUS_BAD_SERVICE_ENTRYPOINT}, { ERRHRD, ERRgeneral, NT_STATUS_LPC_REPLY_LOST}, { ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT1}, { ERRHRD, ERRgeneral, NT_STATUS_IP_ADDRESS_CONFLICT2}, { ERRHRD, ERRgeneral, 
NT_STATUS_REGISTRY_QUOTA_LIMIT}, { ERRSRV, 3, NT_STATUS_PATH_NOT_COVERED}, { ERRHRD, ERRgeneral, NT_STATUS_NO_CALLBACK_ACTIVE}, { ERRHRD, ERRgeneral, NT_STATUS_LICENSE_QUOTA_EXCEEDED}, { ERRHRD, ERRgeneral, NT_STATUS_PWD_TOO_SHORT}, { ERRHRD, ERRgeneral, NT_STATUS_PWD_TOO_RECENT}, { ERRHRD, ERRgeneral, NT_STATUS_PWD_HISTORY_CONFLICT}, { ERRHRD, ERRgeneral, 0xc000025d}, { ERRHRD, ERRgeneral, NT_STATUS_PLUGPLAY_NO_DEVICE}, { ERRHRD, ERRgeneral, NT_STATUS_UNSUPPORTED_COMPRESSION}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_HW_PROFILE}, { ERRHRD, ERRgeneral, NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH}, { ERRDOS, 182, NT_STATUS_DRIVER_ORDINAL_NOT_FOUND}, { ERRDOS, 127, NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND}, { ERRDOS, 288, NT_STATUS_RESOURCE_NOT_OWNED}, { ERRDOS, ErrTooManyLinks, NT_STATUS_TOO_MANY_LINKS}, { ERRHRD, ERRgeneral, NT_STATUS_QUOTA_LIST_INCONSISTENT}, { ERRHRD, ERRgeneral, NT_STATUS_FILE_IS_OFFLINE}, { ERRDOS, 21, 0xc000026e}, { ERRDOS, 161, 0xc0000281}, { ERRDOS, ERRnoaccess, 0xc000028a}, { ERRDOS, ERRnoaccess, 0xc000028b}, { ERRHRD, ERRgeneral, 0xc000028c}, { ERRDOS, ERRnoaccess, 0xc000028d}, { ERRDOS, ERRnoaccess, 0xc000028e}, { ERRDOS, ERRnoaccess, 0xc000028f}, { ERRDOS, ERRnoaccess, 0xc0000290}, { ERRDOS, ERRbadfunc, 0xc000029c}, { ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, { ERRDOS, ERRinvlevel, 0x007c0001}, { 0, 0, 0 } }; /***************************************************************************** Print an error message from the status code *****************************************************************************/ static void cifs_print_status(__u32 status_code) { int idx = 0; while (nt_errs[idx].nt_errstr != NULL) { if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) == (status_code & 0xFFFFFF)) { pr_notice("Status code returned 0x%08x %s\n", status_code, nt_errs[idx].nt_errstr); } idx++; } return; } static void ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode) { int i; if (ntstatus == 0) { *eclass = 0; *ecode = 0; return; } for (i = 0; ntstatus_to_dos_map[i].ntstatus; i++) { if (ntstatus == ntstatus_to_dos_map[i].ntstatus) { *eclass = ntstatus_to_dos_map[i].dos_class; *ecode = ntstatus_to_dos_map[i].dos_code; return; } } *eclass = ERRHRD; *ecode = ERRgeneral; } int map_smb_to_linux_error(char *buf, bool logErr) { struct smb_hdr *smb = (struct smb_hdr *)buf; unsigned int i; int rc = -EIO; /* if transport error smb error may not be set */ __u8 smberrclass; __u16 smberrcode; /* BB if NT Status codes - map NT BB */ /* old style smb error codes */ if (smb->Status.CifsError == 0) return 0; if (smb->Flags2 & SMBFLG2_ERR_STATUS) { /* translate the newer STATUS codes to old style SMB errors * and then to POSIX errors */ __u32 err = le32_to_cpu(smb->Status.CifsError); if (logErr && (err != (NT_STATUS_MORE_PROCESSING_REQUIRED))) cifs_print_status(err); else if (cifsFYI & CIFS_RC) cifs_print_status(err); ntstatus_to_dos(err, &smberrclass, &smberrcode); } else { smberrclass = smb->Status.DosError.ErrorClass; smberrcode = le16_to_cpu(smb->Status.DosError.Error); } /* old style errors */ /* DOS class smb error codes - map DOS */ if (smberrclass == ERRDOS) { /* 1 byte field no need to byte reverse */ for (i = 0; i < sizeof(mapping_table_ERRDOS) / sizeof(struct smb_to_posix_error); i++) { if (mapping_table_ERRDOS[i].smb_err == 0) break; else if (mapping_table_ERRDOS[i].smb_err == smberrcode) { rc = mapping_table_ERRDOS[i].posix_code; break; } /* else try next error mapping one to see if match */ } } else if (smberrclass == ERRSRV) { /* server class of error codes */ for (i = 0; i < 
sizeof(mapping_table_ERRSRV) / sizeof(struct smb_to_posix_error); i++) { if (mapping_table_ERRSRV[i].smb_err == 0) break; else if (mapping_table_ERRSRV[i].smb_err == smberrcode) { rc = mapping_table_ERRSRV[i].posix_code; break; } /* else try next error mapping to see if match */ } } /* else ERRHRD class errors or junk - return EIO */ cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n", le32_to_cpu(smb->Status.CifsError), rc); /* generic corrective action e.g. reconnect SMB session on * ERRbaduid could be added */ return rc; } int map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) { int rc; struct smb_hdr *smb = (struct smb_hdr *)mid->resp_buf; rc = map_smb_to_linux_error((char *)smb, logErr); if (rc == -EACCES && !(smb->Flags2 & SMBFLG2_ERR_STATUS)) { /* possible ERRBaduid */ __u8 class = smb->Status.DosError.ErrorClass; __u16 code = le16_to_cpu(smb->Status.DosError.Error); /* switch can be used to handle different errors */ if (class == ERRSRV && code == ERRbaduid) { cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n", code); cifs_signal_cifsd_for_reconnect(mid->server, false); } } return rc; } /* * calculate the size of the SMB message based on the fixed header * portion, the number of word parameters and the data portion of the message */ unsigned int smbCalcSize(void *buf) { struct smb_hdr *ptr = buf; return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 2 /* size of the bcc field */ + get_bcc(ptr)); } /* The following are taken from fs/ntfs/util.c */ #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) /* * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) * into Unix UTC (based 1970-01-01, in seconds). */ struct timespec64 cifs_NTtimeToUnix(__le64 ntutc) { struct timespec64 ts; /* BB what about the timezone? BB */ /* Subtract the NTFS time offset, then convert to 1s intervals. */ s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; u64 abs_t; /* * Unfortunately can not use normal 64 bit division on 32 bit arch, but * the alternative, do_div, does not work with negative numbers so have * to special case them */ if (t < 0) { abs_t = -t; ts.tv_nsec = (time64_t)(do_div(abs_t, 10000000) * 100); ts.tv_nsec = -ts.tv_nsec; ts.tv_sec = -abs_t; } else { abs_t = t; ts.tv_nsec = (time64_t)do_div(abs_t, 10000000) * 100; ts.tv_sec = abs_t; } return ts; } /* Convert the Unix UTC into NT UTC. */ u64 cifs_UnixTimeToNT(struct timespec64 t) { /* Convert to 100ns intervals and then add the NTFS time offset. 
*/ return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET; } static const int total_days_of_prev_months[] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 }; struct timespec64 cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) { struct timespec64 ts; time64_t sec, days; int min, day, month, year; u16 date = le16_to_cpu(le_date); u16 time = le16_to_cpu(le_time); SMB_TIME *st = (SMB_TIME *)&time; SMB_DATE *sd = (SMB_DATE *)&date; cifs_dbg(FYI, "date %d time %d\n", date, time); sec = 2 * st->TwoSeconds; min = st->Minutes; if ((sec > 59) || (min > 59)) cifs_dbg(VFS, "Invalid time min %d sec %lld\n", min, sec); sec += (min * 60); sec += 60 * 60 * st->Hours; if (st->Hours > 24) cifs_dbg(VFS, "Invalid hours %d\n", st->Hours); day = sd->Day; month = sd->Month; if (day < 1 || day > 31 || month < 1 || month > 12) { cifs_dbg(VFS, "Invalid date, month %d day: %d\n", month, day); day = clamp(day, 1, 31); month = clamp(month, 1, 12); } month -= 1; days = day + total_days_of_prev_months[month]; days += 3652; /* account for difference in days between 1980 and 1970 */ year = sd->Year; days += year * 365; days += (year/4); /* leap year */ /* generalized leap year calculation is more complex, ie no leap year for years/100 except for years/400, but since the maximum number for DOS year is 2**7, the last year is 1980+127, which means we need only consider 2 special case years, ie the years 2000 and 2100, and only adjust for the lack of leap year for the year 2100, as 2000 was a leap year (divisible by 400) */ if (year >= 120) /* the year 2100 */ days = days - 1; /* do not count leap year for the year 2100 */ /* adjust for leap year where we are still before leap day */ if (year != 120) days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0); sec += 24 * 60 * 60 * days; ts.tv_sec = sec + offset; /* cifs_dbg(FYI, "sec after cnvrt dos to unix time %d\n",sec); */ ts.tv_nsec = 0; return ts; }
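A quick way to sanity-check the epoch arithmetic above: cifs_NTtimeToUnix() and cifs_UnixTimeToNT() only shift between the NT epoch (1601-01-01, counted in 100 ns ticks) and the Unix epoch (1970-01-01, counted in seconds), and the gap is the (369*365 + 89) days baked into NTFS_TIME_OFFSET. The standalone userspace sketch below performs the same conversion and round-trips a known value; the helper names, macro name and sample timestamp are invented for the example.

/* Minimal userspace sketch of the NT <-> Unix epoch conversion above. */
#include <stdint.h>
#include <stdio.h>

/* (369*365 + 89) days * 86400 s = 11644473600 s, expressed in 100 ns ticks */
#define NT_EPOCH_DELTA_100NS (11644473600ULL * 10000000ULL)

static int64_t nt_to_unix_secs(uint64_t nt_100ns)
{
	/* Truncates toward zero; the kernel helper also keeps the
	 * sub-second remainder in tv_nsec. */
	return ((int64_t)nt_100ns - (int64_t)NT_EPOCH_DELTA_100NS) / 10000000;
}

static uint64_t unix_to_nt(int64_t secs, long nsecs)
{
	return (uint64_t)secs * 10000000ULL + nsecs / 100 + NT_EPOCH_DELTA_100NS;
}

int main(void)
{
	/* 2009-02-13 23:31:30 UTC == Unix time 1234567890 */
	uint64_t nt = unix_to_nt(1234567890, 0);

	printf("NT ticks:     %llu\n", (unsigned long long)nt);
	printf("back to Unix: %lld\n", (long long)nt_to_unix_secs(nt));
	return 0;
}

Running it prints the NT tick count for 2009-02-13 23:31:30 UTC and then recovers 1234567890; the kernel versions differ only in keeping tv_nsec and in the do_div() detour needed for 64-bit division on 32-bit architectures.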
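cnvrtDosUnixTm() above unpacks legacy 16-bit DOS date and time words before doing its own day arithmetic (including the 3652-day offset between 1970 and 1980). The sketch below shows only the bitfield layout it relies on, following the classic FAT/SMB convention of seconds stored in two-second units; the struct, helper name and sample words are invented for the example.

/* Decode a packed DOS date/time pair into broken-down fields. */
#include <stdint.h>
#include <stdio.h>

struct dos_tm {
	int sec, min, hour;	/* from the 16-bit time word */
	int day, month, year;	/* from the 16-bit date word; year is absolute */
};

static struct dos_tm dos_unpack(uint16_t date, uint16_t time)
{
	struct dos_tm t;

	t.sec   = (time & 0x1f) * 2;		/* bits 0-4: two-second units */
	t.min   = (time >> 5) & 0x3f;		/* bits 5-10 */
	t.hour  = (time >> 11) & 0x1f;		/* bits 11-15 */
	t.day   = date & 0x1f;			/* bits 0-4 */
	t.month = (date >> 5) & 0x0f;		/* bits 5-8 */
	t.year  = 1980 + ((date >> 9) & 0x7f);	/* bits 9-15: years since 1980 */
	return t;
}

int main(void)
{
	/* arbitrary sample words, not captured traffic */
	struct dos_tm t = dos_unpack(0x5a3c, 0x7d1a);

	printf("%04d-%02d-%02d %02d:%02d:%02d\n",
	       t.year, t.month, t.day, t.hour, t.min, t.sec);
	return 0;
}

Because the year field is only 7 bits wide (1980 through 2107), the kernel routine above needs to special-case at most the year 2100 when it counts leap days.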
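Before the next source file in this dump begins, here is a compact userspace sketch of the two-stage lookup performed earlier in this file by ntstatus_to_dos() and map_smb_to_linux_error(): an NT status is first reduced to a DOS (error class, error code) pair, and that pair is then resolved to a POSIX errno through a per-class table. The structs, table contents and names below are illustrative stand-ins rather than the driver's real tables; 0xc0000022 is used as the sample status because it is the customary value of STATUS_ACCESS_DENIED.

/* Toy two-stage mapping: NT status -> DOS (class, code) -> POSIX errno. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct nt_to_dos    { uint32_t ntstatus; uint8_t dos_class; uint16_t dos_code; };
struct dos_to_posix { uint16_t dos_code; int posix_err; };

#define XCLASS_DOS 0x01			/* illustrative class id */
#define NT_EXAMPLE_ACCESS_DENIED 0xc0000022

static const struct nt_to_dos nt_map[] = {
	{ NT_EXAMPLE_ACCESS_DENIED, XCLASS_DOS, 5 },	/* DOS "access denied" */
	{ 0, 0, 0 }					/* terminator, as in the driver */
};

static const struct dos_to_posix dos_map[] = {
	{ 5, -EACCES },
	{ 0, 0 }
};

static int map_status_to_errno(uint32_t ntstatus)
{
	int i, j;

	for (i = 0; nt_map[i].ntstatus; i++) {
		if (nt_map[i].ntstatus != ntstatus ||
		    nt_map[i].dos_class != XCLASS_DOS)
			continue;
		for (j = 0; dos_map[j].dos_code; j++)
			if (dos_map[j].dos_code == nt_map[i].dos_code)
				return dos_map[j].posix_err;
	}
	return -EIO;	/* unmapped codes degrade to a generic I/O error */
}

int main(void)
{
	printf("0x%08x -> %d\n", NT_EXAMPLE_ACCESS_DENIED,
	       map_status_to_errno(NT_EXAMPLE_ACCESS_DENIED));
	return 0;
}

The real driver keeps a separate table per class (mapping_table_ERRDOS, mapping_table_ERRSRV) and initialises rc to -EIO, so anything unmapped, including ERRHRD-class codes, falls back to a generic I/O error, as the comment in map_smb_to_linux_error() notes.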
// SPDX-License-Identifier: GPL-2.0-or-later /* * NET4: Implementation of BSD Unix domain sockets. * * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> * * Fixes: * Linus Torvalds : Assorted bug cures. * Niibe Yutaka : async I/O support. * Carsten Paeth : PF_UNIX check, address fixes. * Alan Cox : Limit size of allocated blocks. * Alan Cox : Fixed the stupid socketpair bug. * Alan Cox : BSD compatibility fine tuning. * Alan Cox : Fixed a bug in connect when interrupted. * Alan Cox : Sorted out a proper draft version of * file descriptor passing hacked up from * Mike Shaver's work. * Marty Leisner : Fixes to fd passing * Nick Nevin : recvmsg bugfix. * Alan Cox : Started proper garbage collector * Heiko EiBfeldt : Missing verify_area check * Alan Cox : Started POSIXisms * Andreas Schwab : Replace inode by dentry for proper * reference counting * Kirk Petersen : Made this a module * Christoph Rohland : Elegant non-blocking accept/connect algorithm. * Lots of bug fixes. * Alexey Kuznetsov : Repaired (I hope) bugs introduced * by the above two patches. * Andrea Arcangeli : If possible we block in connect(2) * if the max backlog of the listen socket * has been reached. This won't break * old apps and it will avoid a huge amount * of socks hashed (this is for unix_gc() * performance reasons). * Security fix that limits the max * number of socks to 2*max_files and * the number of skb queueable in the * dgram receiver. * Artur Skawina : Hash function optimizations * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) * Malcolm Beattie : Set peercred for socketpair * Michal Ostrowski : Module initialization cleanup. * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, * the core infrastructure is doing that * for all net proto families now (2.5.69+) * * Known differences from reference BSD that was tested: * * [TO FIX] * ECONNREFUSED is not returned from one end of a connected() socket to the * other the moment one end closes.
* fstat() doesn't return st_dev=0, and give the blksize as high water mark * and a fake inode identifier (nor the BSD first socket fstat twice bug). * [NOT TO FIX] * accept() returns a path name even if the connecting socket has closed * in the meantime (BSD loses the path and gives up). * accept() returns 0 length path for an unbound connector. BSD returns 16 * and a null first byte in the path (but not for gethost/peername - BSD bug ??) * socketpair(...SOCK_RAW..) doesn't panic the kernel. * BSD af_unix apparently has connect forgetting to block properly. * (need to check this with the POSIX spec in detail) * * Differences from 2.0.0-11-... (ANK) * Bug fixes and improvements. * - client shutdown killed server socket. * - removed all useless cli/sti pairs. * * Semantic changes/extensions. * - generic control message passing. * - SCM_CREDENTIALS control message. * - "Abstract" (not FS based) socket bindings. * Abstract names are sequences of bytes (not zero terminated) * started by 0, so that this name space does not intersect * with BSD names. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/filter.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/af_unix.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/scm.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/rtnetlink.h> #include <linux/mount.h> #include <net/checksum.h> #include <linux/security.h> #include <linux/splice.h> #include <linux/freezer.h> #include <linux/file.h> #include <linux/btf_ids.h> #include <linux/bpf-cgroup.h> static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; /* SMP locking strategy: * hash table is protected with spinlock. * each socket state is protected by separate spinlock. */ #ifdef CONFIG_PROVE_LOCKING #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r))) static int unix_table_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { return cmp_ptr(a, b); } static int unix_state_lock_cmp_fn(const struct lockdep_map *_a, const struct lockdep_map *_b) { const struct unix_sock *a, *b; a = container_of(_a, struct unix_sock, lock.dep_map); b = container_of(_b, struct unix_sock, lock.dep_map); if (a->sk.sk_state == TCP_LISTEN) { /* unix_stream_connect(): Before the 2nd unix_state_lock(), * * 1. a is TCP_LISTEN. * 2. b is not a. * 3. concurrent connect(b -> a) must fail. * * Except for 2. & 3., the b's state can be any possible * value due to concurrent connect() or listen(). * * 2. is detected in debug_spin_lock_before(), and 3. cannot * be expressed as lock_cmp_fn. */ switch (b->sk.sk_state) { case TCP_CLOSE: case TCP_ESTABLISHED: case TCP_LISTEN: return -1; default: /* Invalid case. */ return 0; } } /* Should never happen. Just to be symmetric. 
*/ if (b->sk.sk_state == TCP_LISTEN) { switch (b->sk.sk_state) { case TCP_CLOSE: case TCP_ESTABLISHED: return 1; default: return 0; } } /* unix_state_double_lock(): ascending address order. */ return cmp_ptr(a, b); } static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a, const struct lockdep_map *_b) { const struct sock *a, *b; a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map); b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map); /* unix_collect_skb(): listener -> embryo order. */ if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a) return -1; /* Should never happen. Just to be symmetric. */ if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b) return 1; return 0; } #endif static unsigned int unix_unbound_hash(struct sock *sk) { unsigned long hash = (unsigned long)sk; hash ^= hash >> 16; hash ^= hash >> 8; hash ^= sk->sk_type; return hash & UNIX_HASH_MOD; } static unsigned int unix_bsd_hash(struct inode *i) { return i->i_ino & UNIX_HASH_MOD; } static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, int addr_len, int type) { __wsum csum = csum_partial(sunaddr, addr_len, 0); unsigned int hash; hash = (__force unsigned int)csum_fold(csum); hash ^= hash >> 8; hash ^= type; return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); } static void unix_table_double_lock(struct net *net, unsigned int hash1, unsigned int hash2) { if (hash1 == hash2) { spin_lock(&net->unx.table.locks[hash1]); return; } if (hash1 > hash2) swap(hash1, hash2); spin_lock(&net->unx.table.locks[hash1]); spin_lock(&net->unx.table.locks[hash2]); } static void unix_table_double_unlock(struct net *net, unsigned int hash1, unsigned int hash2) { if (hash1 == hash2) { spin_unlock(&net->unx.table.locks[hash1]); return; } spin_unlock(&net->unx.table.locks[hash1]); spin_unlock(&net->unx.table.locks[hash2]); } #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { UNIXCB(skb).secid = scm->secid; } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { scm->secid = UNIXCB(skb).secid; } static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) { return (scm->secid == UNIXCB(skb).secid); } #else static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) { return true; } #endif /* CONFIG_SECURITY_NETWORK */ static inline int unix_our_peer(struct sock *sk, struct sock *osk) { return unix_peer(osk) == sk; } static inline int unix_may_send(struct sock *sk, struct sock *osk) { return unix_peer(osk) == NULL || unix_our_peer(sk, osk); } static inline int unix_recvq_full_lockless(const struct sock *sk) { return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } struct sock *unix_peer_get(struct sock *s) { struct sock *peer; unix_state_lock(s); peer = unix_peer(s); if (peer) sock_hold(peer); unix_state_unlock(s); return peer; } EXPORT_SYMBOL_GPL(unix_peer_get); static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, int addr_len) { struct unix_address *addr; addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); if (!addr) return NULL; refcount_set(&addr->refcnt, 1); addr->len = addr_len; memcpy(addr->name, sunaddr, addr_len); return addr; } static inline void unix_release_addr(struct unix_address *addr) { if (refcount_dec_and_test(&addr->refcnt)) kfree(addr); } /* 
* Check unix socket name: * - should be not zero length. * - if started by not zero, should be NULL terminated (FS object) * - if started by zero, it is abstract name. */ static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) { if (addr_len <= offsetof(struct sockaddr_un, sun_path) || addr_len > sizeof(*sunaddr)) return -EINVAL; if (sunaddr->sun_family != AF_UNIX) return -EINVAL; return 0; } static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) { struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; short offset = offsetof(struct sockaddr_storage, __data); BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); /* This may look like an off by one error but it is a bit more * subtle. 108 is the longest valid AF_UNIX path for a binding. * sun_path[108] doesn't as such exist. However in kernel space * we are guaranteed that it is a valid memory location in our * kernel address buffer because syscall functions always pass * a pointer of struct sockaddr_storage which has a bigger buffer * than 108. Also, we must terminate sun_path for strlen() in * getname_kernel(). */ addr->__data[addr_len - offset] = 0; /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() * know the actual buffer. */ return strlen(addr->__data) + offset + 1; } static void __unix_remove_socket(struct sock *sk) { sk_del_node_init(sk); } static void __unix_insert_socket(struct net *net, struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); } static void __unix_set_addr_hash(struct net *net, struct sock *sk, struct unix_address *addr, unsigned int hash) { __unix_remove_socket(sk); smp_store_release(&unix_sk(sk)->addr, addr); sk->sk_hash = hash; __unix_insert_socket(net, sk); } static void unix_remove_socket(struct net *net, struct sock *sk) { spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_remove_socket(sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); } static void unix_insert_unbound_socket(struct net *net, struct sock *sk) { spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_insert_socket(net, sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); } static void unix_insert_bsd_socket(struct sock *sk) { spin_lock(&bsd_socket_locks[sk->sk_hash]); sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); spin_unlock(&bsd_socket_locks[sk->sk_hash]); } static void unix_remove_bsd_socket(struct sock *sk) { if (!hlist_unhashed(&sk->sk_bind_node)) { spin_lock(&bsd_socket_locks[sk->sk_hash]); __sk_del_bind_node(sk); spin_unlock(&bsd_socket_locks[sk->sk_hash]); sk_node_init(&sk->sk_bind_node); } } static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) { struct sock *s; sk_for_each(s, &net->unx.table.buckets[hash]) { struct unix_sock *u = unix_sk(s); if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) return s; } return NULL; } static inline struct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) { struct sock *s; spin_lock(&net->unx.table.locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); spin_unlock(&net->unx.table.locks[hash]); return s; } static struct sock *unix_find_socket_byinode(struct inode *i) { unsigned int hash = unix_bsd_hash(i); struct sock *s; spin_lock(&bsd_socket_locks[hash]); sk_for_each_bound(s, &bsd_socket_buckets[hash]) { struct dentry *dentry = 
unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); spin_unlock(&bsd_socket_locks[hash]); return s; } } spin_unlock(&bsd_socket_locks[hash]); return NULL; } /* Support code for asymmetrically connected dgram sockets * * If a datagram socket is connected to a socket not itself connected * to the first socket (eg, /dev/log), clients may only enqueue more * messages if the present receive queue of the server socket is not * "too large". This means there's a second writeability condition * poll and sendmsg need to test. The dgram recv code will do a wake * up on the peer_wait wait queue of a socket upon reception of a * datagram which needs to be propagated to sleeping would-be writers * since these might not have sent anything so far. This can't be * accomplished via poll_wait because the lifetime of the server * socket might be less than that of its clients if these break their * association with it or if the server socket is closed while clients * are still connected to it and there's no way to inform "a polling * implementation" that it should let go of a certain wait queue * * In order to propagate a wake up, a wait_queue_entry_t of the client * socket is enqueued on the peer_wait queue of the server socket * whose wake function does a wake_up on the ordinary client socket * wait queue. This connection is established whenever a write (or * poll for write) hit the flow control condition and broken when the * association to the server socket is dissolved or after a wake up * was relayed. */ static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, void *key) { struct unix_sock *u; wait_queue_head_t *u_sleep; u = container_of(q, struct unix_sock, peer_wake); __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, q); u->peer_wake.private = NULL; /* relaying can only happen while the wq still exists */ u_sleep = sk_sleep(&u->sk); if (u_sleep) wake_up_interruptible_poll(u_sleep, key_to_poll(key)); return 0; } static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) { struct unix_sock *u, *u_other; int rc; u = unix_sk(sk); u_other = unix_sk(other); rc = 0; spin_lock(&u_other->peer_wait.lock); if (!u->peer_wake.private) { u->peer_wake.private = other; __add_wait_queue(&u_other->peer_wait, &u->peer_wake); rc = 1; } spin_unlock(&u_other->peer_wait.lock); return rc; } static void unix_dgram_peer_wake_disconnect(struct sock *sk, struct sock *other) { struct unix_sock *u, *u_other; u = unix_sk(sk); u_other = unix_sk(other); spin_lock(&u_other->peer_wait.lock); if (u->peer_wake.private == other) { __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); u->peer_wake.private = NULL; } spin_unlock(&u_other->peer_wait.lock); } static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, struct sock *other) { unix_dgram_peer_wake_disconnect(sk, other); wake_up_interruptible_poll(sk_sleep(sk), EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); } /* preconditions: * - unix_peer(sk) == other * - association is stable */ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) { int connected; connected = unix_dgram_peer_wake_connect(sk, other); /* If other is SOCK_DEAD, we want to make sure we signal * POLLOUT, such that a subsequent write() can get a * -ECONNREFUSED. Otherwise, if we haven't queued any skbs * to other and its full, we will hang waiting for POLLOUT. 
*/ if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) return 1; if (connected) unix_dgram_peer_wake_disconnect(sk, other); return 0; } static int unix_writable(const struct sock *sk, unsigned char state) { return state != TCP_LISTEN && (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf); } static void unix_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); if (unix_writable(sk, READ_ONCE(sk->sk_state))) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* When dgram socket disconnects (or changes its peer), we clear its receive * queue of packets arrived from previous peer. First, it allows to do * flow control based only on wmem_alloc; second, sk connected to peer * may receive messages only from that peer. */ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) { if (!skb_queue_empty(&sk->sk_receive_queue)) { skb_queue_purge(&sk->sk_receive_queue); wake_up_interruptible_all(&unix_sk(sk)->peer_wait); /* If one link of bidirectional dgram pipe is disconnected, * we signal error. Messages are lost. Do not make this, * when peer was not connected to us. */ if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { WRITE_ONCE(other->sk_err, ECONNRESET); sk_error_report(other); } } } static void unix_sock_destructor(struct sock *sk) { struct unix_sock *u = unix_sk(sk); skb_queue_purge(&sk->sk_receive_queue); DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { pr_info("Attempt to release alive unix socket: %p\n", sk); return; } if (u->addr) unix_release_addr(u->addr); atomic_long_dec(&unix_nr_socks); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); #ifdef UNIX_REFCNT_DEBUG pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, atomic_long_read(&unix_nr_socks)); #endif } static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); struct sock *skpair; struct sk_buff *skb; struct path path; int state; unix_remove_socket(sock_net(sk), sk); unix_remove_bsd_socket(sk); /* Clear state */ unix_state_lock(sk); sock_orphan(sk); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); path = u->path; u->path.dentry = NULL; u->path.mnt = NULL; state = sk->sk_state; WRITE_ONCE(sk->sk_state, TCP_CLOSE); skpair = unix_peer(sk); unix_peer(sk) = NULL; unix_state_unlock(sk); #if IS_ENABLED(CONFIG_AF_UNIX_OOB) u->oob_skb = NULL; #endif wake_up_interruptible_all(&u->peer_wait); if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); } unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ } /* Try to flush out this socket. 
Throw out buffers at least */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); /* passed fds are erased in the kfree_skb hook */ kfree_skb(skb); } if (path.dentry) path_put(&path); sock_put(sk); /* ---- Socket is dead now and most probably destroyed ---- */ /* * Fixme: BSD difference: In BSD all sockets connected to us get * ECONNRESET and we die on the spot. In Linux we behave * like files and pipes do and wait for the last * dereference. * * Can't we simply set sock->err? * * What the above comment does talk about? --ANK(980817) */ if (READ_ONCE(unix_tot_inflight)) unix_gc(); /* Garbage collect fds */ } static void init_peercred(struct sock *sk) { sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); } static void update_peercred(struct sock *sk) { const struct cred *old_cred; struct pid *old_pid; spin_lock(&sk->sk_peer_lock); old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; init_peercred(sk); spin_unlock(&sk->sk_peer_lock); put_pid(old_pid); put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { lockdep_assert_held(&unix_sk(peersk)->lock); spin_lock(&sk->sk_peer_lock); sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); } static int unix_listen(struct socket *sock, int backlog) { int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; /* Only stream/seqpacket sockets accept */ err = -EINVAL; if (!READ_ONCE(u->addr)) goto out; /* No listens on an unbound socket */ unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) goto out_unlock; if (backlog > sk->sk_max_ack_backlog) wake_up_interruptible_all(&u->peer_wait); sk->sk_max_ack_backlog = backlog; WRITE_ONCE(sk->sk_state, TCP_LISTEN); /* set credentials so connect can copy them */ update_peercred(sk); err = 0; out_unlock: unix_state_unlock(sk); out: return err; } static int unix_release(struct socket *); static int unix_bind(struct socket *, struct sockaddr *, int); static int unix_stream_connect(struct socket *, struct sockaddr *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); static int unix_getname(struct socket *, struct sockaddr *, int); static __poll_t unix_poll(struct file *, struct socket *, poll_table *); static __poll_t unix_dgram_poll(struct file *, struct socket *, poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); #endif static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, struct pipe_inode_info *, size_t size, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, 
int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, int); #ifdef CONFIG_PROC_FS static int unix_count_nr_fds(struct sock *sk) { struct sk_buff *skb; struct unix_sock *u; int nr_fds = 0; spin_lock(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); while (skb) { u = unix_sk(skb->sk); nr_fds += atomic_read(&u->scm_stat.nr_fds); skb = skb_peek_next(skb, &sk->sk_receive_queue); } spin_unlock(&sk->sk_receive_queue.lock); return nr_fds; } static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; unsigned char s_state; struct unix_sock *u; int nr_fds = 0; if (sk) { s_state = READ_ONCE(sk->sk_state); u = unix_sk(sk); /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. * SOCK_DGRAM is ordinary. So, no lock is needed. */ if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) nr_fds = atomic_read(&u->scm_stat.nr_fds); else if (s_state == TCP_LISTEN) nr_fds = unix_count_nr_fds(sk); seq_printf(m, "scm_fds: %u\n", nr_fds); } } #else #define unix_show_fdinfo NULL #endif static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_stream_connect, .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, .poll = unix_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = unix_listen, .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .read_skb = unix_stream_read_skb, .mmap = sock_no_mmap, .splice_read = unix_stream_splice_read, .set_peek_off = sk_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_dgram_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_dgram_connect, .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, .poll = unix_dgram_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, .read_skb = unix_read_skb, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .set_peek_off = sk_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static const struct proto_ops unix_seqpacket_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, .bind = unix_bind, .connect = unix_stream_connect, .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, .poll = unix_dgram_poll, .ioctl = unix_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = unix_compat_ioctl, #endif .listen = unix_listen, .shutdown = unix_shutdown, .sendmsg = unix_seqpacket_sendmsg, .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, .set_peek_off = sk_set_peek_off, .show_fdinfo = unix_show_fdinfo, }; static void unix_close(struct sock *sk, long timeout) { /* Nothing to do here, unix socket does not need a ->close(). * This is merely for sockmap. */ } static void unix_unhash(struct sock *sk) { /* Nothing to do here, unix socket does not need a ->unhash(). * This is merely for sockmap. 
*/ } static bool unix_bpf_bypass_getsockopt(int level, int optname) { if (level == SOL_SOCKET) { switch (optname) { case SO_PEERPIDFD: return true; default: return false; } } return false; } struct proto unix_dgram_proto = { .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_dgram_bpf_update_proto, #endif }; struct proto unix_stream_proto = { .name = "UNIX-STREAM", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, .unhash = unix_unhash, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, #endif }; static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { struct unix_sock *u; struct sock *sk; int err; atomic_long_inc(&unix_nr_socks); if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { err = -ENFILE; goto err; } if (type == SOCK_STREAM) sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); else /*dgram and seqpacket */ sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); if (!sk) { err = -ENOMEM; goto err; } sock_init_data(sock, sk); sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); sk->sk_destruct = unix_sock_destructor; lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL); u = unix_sk(sk); u->listener = NULL; u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_unbound_socket(net, sk); sock_prot_inuse_add(net, sk->sk_prot, 1); return sk; err: atomic_long_dec(&unix_nr_socks); return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; sock->state = SS_UNCONNECTED; switch (sock->type) { case SOCK_STREAM: sock->ops = &unix_stream_ops; break; /* * Believe it or not BSD has AF_UNIX, SOCK_RAW though * nothing uses it. 
*/ case SOCK_RAW: sock->type = SOCK_DGRAM; fallthrough; case SOCK_DGRAM: sock->ops = &unix_dgram_ops; break; case SOCK_SEQPACKET: sock->ops = &unix_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } sk = unix_create1(net, sock, kern, sock->type); if (IS_ERR(sk)) return PTR_ERR(sk); return 0; } static int unix_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; sk->sk_prot->close(sk, 0); unix_release_sock(sk, 0); sock->sk = NULL; return 0; } static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, int type) { struct inode *inode; struct path path; struct sock *sk; int err; unix_mkname_bsd(sunaddr, addr_len); err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); if (err) goto fail; err = path_permission(&path, MAY_WRITE); if (err) goto path_put; err = -ECONNREFUSED; inode = d_backing_inode(path.dentry); if (!S_ISSOCK(inode->i_mode)) goto path_put; sk = unix_find_socket_byinode(inode); if (!sk) goto path_put; err = -EPROTOTYPE; if (sk->sk_type == type) touch_atime(&path); else goto sock_put; path_put(&path); return sk; sock_put: sock_put(sk); path_put: path_put(&path); fail: return ERR_PTR(err); } static struct sock *unix_find_abstract(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type) { unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); struct dentry *dentry; struct sock *sk; sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); if (!sk) return ERR_PTR(-ECONNREFUSED); dentry = unix_sk(sk)->path.dentry; if (dentry) touch_atime(&unix_sk(sk)->path); return sk; } static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type) { struct sock *sk; if (sunaddr->sun_path[0]) sk = unix_find_bsd(sunaddr, addr_len, type); else sk = unix_find_abstract(net, sunaddr, addr_len, type); return sk; } static int unix_autobind(struct sock *sk) { struct unix_sock *u = unix_sk(sk); unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; u32 lastnum, ordernum; int err; err = mutex_lock_interruptible(&u->bindlock); if (err) return err; if (u->addr) goto out; err = -ENOMEM; addr = kzalloc(sizeof(*addr) + offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); if (!addr) goto out; addr->len = offsetof(struct sockaddr_un, sun_path) + 6; addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); old_hash = sk->sk_hash; ordernum = get_random_u32(); lastnum = ordernum & 0xFFFFF; retry: ordernum = (ordernum + 1) & 0xFFFFF; sprintf(addr->name->sun_path + 1, "%05x", ordernum); new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { unix_table_double_unlock(net, old_hash, new_hash); /* __unix_find_socket_byname() may take long time if many names * are already in use. */ cond_resched(); if (ordernum == lastnum) { /* Give up if all names seems to be in use. 
*/ err = -ENOSPC; unix_release_addr(addr); goto out; } goto retry; } __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); err = 0; out: mutex_unlock(&u->bindlock); return err; } static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); struct unix_sock *u = unix_sk(sk); unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct mnt_idmap *idmap; struct unix_address *addr; struct dentry *dentry; struct path parent; int err; addr_len = unix_mkname_bsd(sunaddr, addr_len); addr = unix_create_addr(sunaddr, addr_len); if (!addr) return -ENOMEM; /* * Get the parent directory, calculate the hash for last * component. */ dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; } /* * All right, let's create it. */ idmap = mnt_idmap(parent.mnt); err = security_path_mknod(&parent, dentry, mode, 0); if (!err) err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); if (err) goto out_path; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out_unlink; if (u->addr) goto out_unlock; old_hash = sk->sk_hash; new_hash = unix_bsd_hash(d_backing_inode(dentry)); unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; out_unlock: mutex_unlock(&u->bindlock); err = -EINVAL; out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: done_path_create(&parent, dentry); out: unix_release_addr(addr); return err == -EEXIST ? 
-EADDRINUSE : err; } static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { struct unix_sock *u = unix_sk(sk); unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; int err; addr = unix_create_addr(sunaddr, addr_len); if (!addr) return -ENOMEM; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out; if (u->addr) { err = -EINVAL; goto out_mutex; } old_hash = sk->sk_hash; new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) goto out_spin; __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; out_spin: unix_table_double_unlock(net, old_hash, new_hash); err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); out: unix_release_addr(addr); return err; } static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; int err; if (addr_len == offsetof(struct sockaddr_un, sun_path) && sunaddr->sun_family == AF_UNIX) return unix_autobind(sk); err = unix_validate_addr(sunaddr, addr_len); if (err) return err; if (sunaddr->sun_path[0]) err = unix_bind_bsd(sk, sunaddr, addr_len); else err = unix_bind_abstract(sk, sunaddr, addr_len); return err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) { if (unlikely(sk1 == sk2) || !sk2) { unix_state_lock(sk1); return; } if (sk1 > sk2) swap(sk1, sk2); unix_state_lock(sk1); unix_state_lock(sk2); } static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) { if (unlikely(sk1 == sk2) || !sk2) { unix_state_unlock(sk1); return; } unix_state_unlock(sk1); unix_state_unlock(sk2); } static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; struct sock *sk = sock->sk; struct sock *other; int err; err = -EINVAL; if (alen < offsetofend(struct sockaddr, sa_family)) goto out; if (addr->sa_family != AF_UNSPEC) { err = unix_validate_addr(sunaddr, alen); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); if (err) goto out; if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !READ_ONCE(unix_sk(sk)->addr)) { err = unix_autobind(sk); if (err) goto out; } restart: other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; } unix_state_double_lock(sk, other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { unix_state_double_unlock(sk, other); sock_put(other); goto restart; } err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock; WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); } else { /* * 1003.1g breaking connected state with AF_UNSPEC */ other = NULL; unix_state_double_lock(sk, other); } /* * If it was connected, reconnect. 
*/ if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk) = other; if (!other) WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); unix_state_double_unlock(sk, other); if (other != old_peer) { unix_dgram_disconnected(sk, old_peer); unix_state_lock(old_peer); if (!unix_peer(old_peer)) WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); unix_state_unlock(old_peer); } sock_put(old_peer); } else { unix_peer(sk) = other; unix_state_double_unlock(sk, other); } return 0; out_unlock: unix_state_double_unlock(sk, other); sock_put(other); out: return err; } static long unix_wait_for_peer(struct sock *other, long timeo) __releases(&unix_sk(other)->lock) { struct unix_sock *u = unix_sk(other); int sched; DEFINE_WAIT(wait); prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); sched = !sock_flag(other, SOCK_DEAD) && !(other->sk_shutdown & RCV_SHUTDOWN) && unix_recvq_full_lockless(other); unix_state_unlock(other); if (sched) timeo = schedule_timeout(timeo); finish_wait(&u->peer_wait, &wait); return timeo; } static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; unsigned char state; long timeo; int err; err = unix_validate_addr(sunaddr, addr_len); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); if (err) goto out; if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); /* First of all allocate resources. If we will make it after state is locked, we will have to recheck all again in any case. */ /* create new sock for complete connection */ newsk = unix_create1(net, NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); newsk = NULL; goto out; } err = -ENOMEM; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (skb == NULL) goto out; restart: /* Find listening sock. */ other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); other = NULL; goto out; } unix_state_lock(other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { unix_state_unlock(other); sock_put(other); goto restart; } err = -ECONNREFUSED; if (other->sk_state != TCP_LISTEN) goto out_unlock; if (other->sk_shutdown & RCV_SHUTDOWN) goto out_unlock; if (unix_recvq_full_lockless(other)) { err = -EAGAIN; if (!timeo) goto out_unlock; timeo = unix_wait_for_peer(other, timeo); err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; sock_put(other); goto restart; } /* self connect and simultaneous connect are eliminated * by rejecting TCP_LISTEN socket to avoid deadlock. */ state = READ_ONCE(sk->sk_state); if (unlikely(state != TCP_CLOSE)) { err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; goto out_unlock; } unix_state_lock(sk); if (unlikely(sk->sk_state != TCP_CLOSE)) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; unix_state_unlock(sk); goto out_unlock; } err = security_unix_stream_connect(sk, other, newsk); if (err) { unix_state_unlock(sk); goto out_unlock; } /* The way is open! Fastly set all the necessary fields... 
*/ sock_hold(sk); unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); /* copy address information from listening to new sock * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found * otheru in hash under its lock. Insertion into the * hash chain we'd found it in had been done in an * earlier critical area protected by the chain's lock, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * * Using smp_store_release() here to set newu->addr * is enough to make those stores, as well as stores * to newu->path visible to anyone who gets newu->addr * by smp_load_acquire(). IOW, the same warranties * as for unix_sock instances bound in unix_bind() or * in unix_autobind(). */ if (otheru->path.dentry) { path_get(&otheru->path); newu->path = otheru->path; } refcount_inc(&otheru->addr->refcnt); smp_store_release(&newu->addr, otheru->addr); /* Set credentials */ copy_peercred(sk, other); sock->state = SS_CONNECTED; WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); sock_hold(newsk); smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ unix_peer(sk) = newsk; unix_state_unlock(sk); /* take ten and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); return 0; out_unlock: if (other) unix_state_unlock(other); out: kfree_skb(skb); if (newsk) unix_release_sock(newsk, 0); if (other) sock_put(other); return err; } static int unix_socketpair(struct socket *socka, struct socket *sockb) { struct sock *ska = socka->sk, *skb = sockb->sk; /* Join our sockets back to back */ sock_hold(ska); sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; init_peercred(ska); init_peercred(skb); ska->sk_state = TCP_ESTABLISHED; skb->sk_state = TCP_ESTABLISHED; socka->state = SS_CONNECTED; sockb->state = SS_CONNECTED; return 0; } static void unix_sock_inherit_flags(const struct socket *old, struct socket *new) { if (test_bit(SOCK_PASSCRED, &old->flags)) set_bit(SOCK_PASSCRED, &new->flags); if (test_bit(SOCK_PASSPIDFD, &old->flags)) set_bit(SOCK_PASSPIDFD, &new->flags); if (test_bit(SOCK_PASSSEC, &old->flags)) set_bit(SOCK_PASSSEC, &new->flags); } static int unix_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sk_buff *skb; struct sock *tsk; arg->err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; arg->err = -EINVAL; if (READ_ONCE(sk->sk_state) != TCP_LISTEN) goto out; /* If socket state is TCP_LISTEN it cannot change (for now...), * so that no locks are necessary. */ skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, &arg->err); if (!skb) { /* This means receive shutdown. 
*/ if (arg->err == 0) arg->err = -EINVAL; goto out; } tsk = skb->sk; skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); /* attach accepted sock to socket */ unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; out: return arg->err; } static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct unix_address *addr; DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); int err = 0; if (peer) { sk = unix_peer_get(sk); err = -ENOTCONN; if (!sk) goto out; err = 0; } else { sock_hold(sk); } addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; err = offsetof(struct sockaddr_un, sun_path); } else { err = addr->len; memcpy(sunaddr, addr->name, addr->len); if (peer) BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, CGROUP_UNIX_GETPEERNAME); else BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, CGROUP_UNIX_GETSOCKNAME); } sock_put(sk); out: return err; } /* The "user->unix_inflight" variable is protected by the garbage * collection lock, and we just read it locklessly here. If you go * over the limit, there might be a tiny race in actually noticing * it across threads. Tough. */ static inline bool too_many_unix_fds(struct task_struct *p) { struct user_struct *user = current_user(); if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); return false; } static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { if (too_many_unix_fds(current)) return -ETOOMANYREFS; UNIXCB(skb).fp = scm->fp; scm->fp = NULL; if (unix_prepare_fpl(UNIXCB(skb).fp)) return -ENOMEM; return 0; } static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; unix_destroy_fpl(scm->fp); } static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); } static void unix_destruct_scm(struct sk_buff *skb) { struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); scm.pid = UNIXCB(skb).pid; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); /* Alas, it calls VFS */ /* So fscking what? fput() had been SMP-safe since the last Summer */ scm_destroy(&scm); sock_wfree(skb); } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; UNIXCB(skb).pid = get_pid(scm->pid); UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; unix_get_secdata(scm, skb); if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); skb->destructor = unix_destruct_scm; return err; } static bool unix_passcred_enabled(const struct socket *sock, const struct sock *other) { return test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags) || !other->sk_socket || test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); } /* * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. 
*/ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, const struct sock *other) { if (UNIXCB(skb).pid) return; if (unix_passcred_enabled(sock, other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } static bool unix_skb_scm_eq(struct sk_buff *skb, struct scm_cookie *scm) { return UNIXCB(skb).pid == scm->pid && uid_eq(UNIXCB(skb).uid, scm->creds.uid) && gid_eq(UNIXCB(skb).gid, scm->creds.gid) && unix_secdata_eq(scm, skb); } static void scm_stat_add(struct sock *sk, struct sk_buff *skb) { struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); unix_add_edges(fp, u); } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) { struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); unix_del_edges(fp); } } /* * Send AF_UNIX data. */ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *sk = sock->sk, *other = NULL; struct unix_sock *u = unix_sk(sk); struct scm_cookie scm; struct sk_buff *skb; int data_len = 0; int sk_locked; long timeo; int err; err = scm_send(sock, msg, &scm, false); if (err < 0) return err; wait_for_unix_gc(scm.fp); err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; if (msg->msg_namelen) { err = unix_validate_addr(sunaddr, msg->msg_namelen); if (err) goto out; err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, msg->msg_name, &msg->msg_namelen, NULL); if (err) goto out; } else { sunaddr = NULL; err = -ENOTCONN; other = unix_peer_get(sk); if (!other) goto out; } if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; } err = -EMSGSIZE; if (len > READ_ONCE(sk->sk_sndbuf) - 32) goto out; if (len > SKB_MAX_ALLOC) { data_len = min_t(size_t, len - SKB_MAX_ALLOC, MAX_SKB_FRAGS * PAGE_SIZE); data_len = PAGE_ALIGN(data_len); BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); } skb = sock_alloc_send_pskb(sk, len - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, PAGE_ALLOC_COSTLY_ORDER); if (skb == NULL) goto out; err = unix_scm_to_skb(&scm, skb, true); if (err < 0) goto out_free; skb_put(skb, len - data_len); skb->data_len = data_len; skb->len = len; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); if (err) goto out_free; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); restart: if (!other) { err = -ECONNRESET; if (sunaddr == NULL) goto out_free; other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); other = NULL; goto out_free; } } if (sk_filter(other, skb) < 0) { /* Toss the packet but do not return any error to the sender */ err = len; goto out_free; } sk_locked = 0; unix_state_lock(other); restart_locked: err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; if (unlikely(sock_flag(other, SOCK_DEAD))) { /* * Check with 1003.1g - what should * datagram error */ unix_state_unlock(other); sock_put(other); if (!sk_locked) unix_state_lock(sk); err = 0; if (sk->sk_type == SOCK_SEQPACKET) { /* We are here only when racing with unix_release_sock() * is clearing @other. Never change state to TCP_CLOSE * unlike SOCK_DGRAM wants. 
*/ unix_state_unlock(sk); err = -EPIPE; } else if (unix_peer(sk) == other) { unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_state_unlock(sk); unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; } else { unix_state_unlock(sk); } other = NULL; if (err) goto out_free; goto restart; } err = -EPIPE; if (other->sk_shutdown & RCV_SHUTDOWN) goto out_unlock; if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock; } /* other == sk && unix_peer(other) != sk if * - unix_peer(sk) == NULL, destination address bound to sk * - unix_peer(sk) == sk by time of get but disconnected before lock */ if (other != sk && unlikely(unix_peer(other) != sk && unix_recvq_full_lockless(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); err = sock_intr_errno(timeo); if (signal_pending(current)) goto out_free; goto restart; } if (!sk_locked) { unix_state_unlock(other); unix_state_double_lock(sk, other); } if (unix_peer(sk) != other || unix_dgram_peer_wake_me(sk, other)) { err = -EAGAIN; sk_locked = 1; goto out_unlock; } if (!sk_locked) { sk_locked = 1; goto restart_locked; } } if (unlikely(sk_locked)) unix_state_unlock(sk); if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); scm_destroy(&scm); return len; out_unlock: if (sk_locked) unix_state_unlock(sk); unix_state_unlock(other); out_free: kfree_skb(skb); out: if (other) sock_put(other); scm_destroy(&scm); return err; } /* We use paged skbs for stream sockets, and limit occupancy to 32768 * bytes, and a minimum of a full page. */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, struct scm_cookie *scm, bool fds_sent) { struct unix_sock *ousk = unix_sk(other); struct sk_buff *skb; int err = 0; skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) return err; err = unix_scm_to_skb(scm, skb, !fds_sent); if (err < 0) { kfree_skb(skb); return err; } skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); if (err) { kfree_skb(skb); return err; } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { unix_state_unlock(other); kfree_skb(skb); return -EPIPE; } maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); spin_lock(&other->sk_receive_queue.lock); WRITE_ONCE(ousk->oob_skb, skb); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); sk_send_sigurg(other); unix_state_unlock(other); other->sk_data_ready(other); return err; } #endif static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sock *other = NULL; int err, size; struct sk_buff *skb; int sent = 0; struct scm_cookie scm; bool fds_sent = false; int data_len; err = scm_send(sock, msg, &scm, false); if (err < 0) return err; wait_for_unix_gc(scm.fp); err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (len) len--; else #endif goto out_err; } if (msg->msg_namelen) { err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; goto out_err; } else { err = -ENOTCONN; other = unix_peer(sk); if (!other) goto out_err; } if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto pipe_err; while (sent < len) { size = len - sent; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb = sock_alloc_send_pskb(sk, 0, 0, msg->msg_flags & MSG_DONTWAIT, &err, 0); } else { /* Keep two messages in the pipe so it schedules better */ size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); /* allow fallback to order-0 allocations */ size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); skb = sock_alloc_send_pskb(sk, size - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, get_order(UNIX_SKB_FRAGS_SZ)); } if (!skb) goto out_err; /* Only send the fds in the first buffer */ err = unix_scm_to_skb(&scm, skb, !fds_sent); if (err < 0) { kfree_skb(skb); goto out_err; } fds_sent = true; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { err = skb_splice_from_iter(skb, &msg->msg_iter, size, sk->sk_allocation); if (err < 0) { kfree_skb(skb); goto out_err; } size = err; refcount_add(size, &sk->sk_wmem_alloc); } else { skb_put(skb, size - data_len); skb->data_len = data_len; skb->len = size; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) { kfree_skb(skb); goto out_err; } } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) goto pipe_err_free; maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other); sent += size; } #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (msg->msg_flags & MSG_OOB) { err = queue_oob(sock, msg, other, &scm, fds_sent); if (err) goto out_err; sent++; } #endif scm_destroy(&scm); return sent; pipe_err_free: unix_state_unlock(other); kfree_skb(skb); pipe_err: if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; out_err: scm_destroy(&scm); return sent ? 
: err; } static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { int err; struct sock *sk = sock->sk; err = sock_error(sk); if (err) return err; if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) return -ENOTCONN; if (msg->msg_namelen) msg->msg_namelen = 0; return unix_dgram_sendmsg(sock, msg, len); } static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) return -ENOTCONN; return unix_dgram_recvmsg(sock, msg, size, flags); } static void unix_copy_addr(struct msghdr *msg, struct sock *sk) { struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); if (addr) { msg->msg_namelen = addr->len; memcpy(msg->msg_name, addr->name, addr->len); } } int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags) { struct scm_cookie scm; struct socket *sock = sk->sk_socket; struct unix_sock *u = unix_sk(sk); struct sk_buff *skb, *last; long timeo; int skip; int err; err = -EOPNOTSUPP; if (flags&MSG_OOB) goto out; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, &skip, &err, &last); if (skb) { if (!(flags & MSG_PEEK)) scm_stat_del(sk, skb); break; } mutex_unlock(&u->iolock); if (err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && (sk->sk_shutdown & RCV_SHUTDOWN)) err = 0; unix_state_unlock(sk); goto out; } if (wq_has_sleeper(&u->peer_wait)) wake_up_interruptible_sync_poll(&u->peer_wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); if (msg->msg_name) { unix_copy_addr(msg, skb->sk); BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name, &msg->msg_namelen); } if (size > skb->len - skip) size = skb->len - skip; else if (size < skb->len - skip) msg->msg_flags |= MSG_TRUNC; err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free; if (sock_flag(sk, SOCK_RCVTSTAMP)) __sock_recv_timestamp(msg, sk, skb); memset(&scm, 0, sizeof(scm)); scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); sk_peek_offset_bwd(sk, skb->len); } else { /* It is questionable: on PEEK we could: - do not return fds - good, but too simple 8) - return fds, and do not return them on read (old strategy, apparently wrong) - clone fds (I chose it for now, it is the most universal solution) POSIX 1003.1g does not actually define this clearly at all. POSIX 1003.1g doesn't define a lot of things clearly however! */ sk_peek_offset_fwd(sk, size); if (UNIXCB(skb).fp) unix_peek_fds(&scm, skb); } err = (flags & MSG_TRUNC) ? 
skb->len - skip : size; scm_recv_unix(sock, msg, &scm, flags); out_free: skb_free_datagram(sk, skb); mutex_unlock(&u->iolock); out: return err; } static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; #ifdef CONFIG_BPF_SYSCALL const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_dgram_proto) return prot->recvmsg(sk, msg, size, flags, NULL); #endif return __unix_dgram_recvmsg(sk, msg, size, flags); } static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int err; mutex_lock(&u->iolock); skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); mutex_unlock(&u->iolock); if (!skb) return err; return recv_actor(sk, skb); } /* * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, struct sk_buff *last, unsigned int last_len, bool freezable) { unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, state); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || !timeo) break; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); timeo = schedule_timeout(timeo); unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) break; sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); unix_state_unlock(sk); return timeo; } static unsigned int unix_skb_len(const struct sk_buff *skb) { return skb->len - UNIXCB(skb).consumed; } struct unix_stream_read_state { int (*recv_actor)(struct sk_buff *, int, int, struct unix_stream_read_state *); struct socket *socket; struct msghdr *msg; struct pipe_inode_info *pipe; size_t size; int flags; unsigned int splice_flags; }; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int unix_stream_recv_urg(struct unix_stream_read_state *state) { struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int chunk = 1; struct sk_buff *oob_skb; mutex_lock(&u->iolock); unix_state_lock(sk); spin_lock(&sk->sk_receive_queue.lock); if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); mutex_unlock(&u->iolock); return -EINVAL; } oob_skb = u->oob_skb; if (!(state->flags & MSG_PEEK)) WRITE_ONCE(u->oob_skb, NULL); spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); chunk = state->recv_actor(oob_skb, 0, chunk, state); if (!(state->flags & MSG_PEEK)) UNIXCB(oob_skb).consumed += 1; mutex_unlock(&u->iolock); if (chunk < 0) return -EFAULT; state->msg->msg_flags |= MSG_OOB; return 1; } static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, int flags, int copied) { struct sk_buff *read_skb = NULL, *unread_skb = NULL; struct unix_sock *u = unix_sk(sk); if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb))) return skb; spin_lock(&sk->sk_receive_queue.lock); if (!unix_skb_len(skb)) { if (copied && (!u->oob_skb || skb == u->oob_skb)) { skb = NULL; } else if (flags & MSG_PEEK) { skb = skb_peek_next(skb, &sk->sk_receive_queue); } else { read_skb = skb; skb = skb_peek_next(skb, &sk->sk_receive_queue); __skb_unlink(read_skb, &sk->sk_receive_queue); } if (!skb) goto unlock; } if (skb != u->oob_skb) goto unlock; if (copied) { skb = NULL; } else if (!(flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, 
NULL); if (!sock_flag(sk, SOCK_URGINLINE)) { __skb_unlink(skb, &sk->sk_receive_queue); unread_skb = skb; skb = skb_peek(&sk->sk_receive_queue); } } else if (!sock_flag(sk, SOCK_URGINLINE)) { skb = skb_peek_next(skb, &sk->sk_receive_queue); } unlock: spin_unlock(&sk->sk_receive_queue.lock); consume_skb(read_skb); kfree_skb(unread_skb); return skb; } #endif static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int err; if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) return -ENOTCONN; mutex_lock(&u->iolock); skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); mutex_unlock(&u->iolock); if (!skb) return err; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (unlikely(skb == READ_ONCE(u->oob_skb))) { bool drop = false; unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { unix_state_unlock(sk); kfree_skb(skb); return -ECONNRESET; } spin_lock(&sk->sk_receive_queue.lock); if (likely(skb == u->oob_skb)) { WRITE_ONCE(u->oob_skb, NULL); drop = true; } spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); if (drop) { kfree_skb(skb); return -EAGAIN; } } #endif return recv_actor(sk, skb); } static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { struct scm_cookie scm; struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int copied = 0; int flags = state->flags; int noblock = flags & MSG_DONTWAIT; bool check_creds = false; int target; int err = 0; long timeo; int skip; size_t size = state->size; unsigned int last_len; if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { err = -EINVAL; goto out; } if (unlikely(flags & MSG_OOB)) { err = -EOPNOTSUPP; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) err = unix_stream_recv_urg(state); #endif goto out; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); memset(&scm, 0, sizeof(scm)); /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ mutex_lock(&u->iolock); skip = max(sk_peek_offset(sk, flags), 0); do { struct sk_buff *skb, *last; int chunk; redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; goto unlock; } last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); if (!skb && copied) { unix_state_unlock(sk); break; } } #endif if (skb == NULL) { if (copied >= target) goto unlock; /* * POSIX 1003.1g mandates this order. 
*/ err = sock_error(sk); if (err) goto unlock; if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; unix_state_unlock(sk); if (!timeo) { err = -EAGAIN; break; } mutex_unlock(&u->iolock); timeo = unix_stream_data_wait(sk, timeo, last, last_len, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); scm_destroy(&scm); goto out; } mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); break; } while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; } unix_state_unlock(sk); if (check_creds) { /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; } else if (test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); check_creds = true; } /* Copy address just once */ if (state->msg && state->msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, state->msg->msg_name); unix_copy_addr(state->msg, skb->sk); BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, state->msg->msg_name, &state->msg->msg_namelen); sunaddr = NULL; } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); chunk = state->recv_actor(skb, skip, chunk, state); if (chunk < 0) { if (copied == 0) copied = -EFAULT; break; } copied += chunk; size -= chunk; /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { UNIXCB(skb).consumed += chunk; sk_peek_offset_bwd(sk, chunk); if (UNIXCB(skb).fp) { scm_stat_del(sk, skb); unix_detach_fds(&scm, skb); } if (unix_skb_len(skb)) break; skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); if (scm.fp) break; } else { /* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) unix_peek_fds(&scm, skb); sk_peek_offset_fwd(sk, chunk); if (UNIXCB(skb).fp) break; skip = 0; last = skb; last_len = skb->len; unix_state_lock(sk); skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb) goto again; unix_state_unlock(sk); break; } } while (size); mutex_unlock(&u->iolock); if (state->msg) scm_recv_unix(sock, state->msg, &scm, flags); else scm_destroy(&scm); out: return copied ? 
: err; } static int unix_stream_read_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { int ret; ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, state->msg, chunk); return ret ?: chunk; } int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_read_actor, .socket = sk->sk_socket, .msg = msg, .size = size, .flags = flags }; return unix_stream_read_generic(&state, true); } static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_read_actor, .socket = sock, .msg = msg, .size = size, .flags = flags }; #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_stream_proto) return prot->recvmsg(sk, msg, size, flags, NULL); #endif return unix_stream_read_generic(&state, true); } static int unix_stream_splice_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { return skb_splice_bits(skb, state->socket->sk, UNIXCB(skb).consumed + skip, state->pipe, chunk, state->splice_flags); } static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t size, unsigned int flags) { struct unix_stream_read_state state = { .recv_actor = unix_stream_splice_actor, .socket = sock, .pipe = pipe, .size = size, .splice_flags = flags, }; if (unlikely(*ppos)) return -ESPIPE; if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK) state.flags = MSG_DONTWAIT; return unix_stream_read_generic(&state, false); } static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; struct sock *other; if (mode < SHUT_RD || mode > SHUT_RDWR) return -EINVAL; /* This maps: * SHUT_RD (0) -> RCV_SHUTDOWN (1) * SHUT_WR (1) -> SEND_SHUTDOWN (2) * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) */ ++mode; unix_state_lock(sk); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); other = unix_peer(sk); if (other) sock_hold(other); unix_state_unlock(sk); sk->sk_state_change(sk); if (other && (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { int peer_mode = 0; const struct proto *prot = READ_ONCE(other->sk_prot); if (prot->unhash) prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) peer_mode |= RCV_SHUTDOWN; unix_state_lock(other); WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); unix_state_unlock(other); other->sk_state_change(other); if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); } if (other) sock_put(other); return 0; } long unix_inq_len(struct sock *sk) { struct sk_buff *skb; long amount = 0; if (READ_ONCE(sk->sk_state) == TCP_LISTEN) return -EINVAL; spin_lock(&sk->sk_receive_queue.lock); if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) amount += unix_skb_len(skb); } else { skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; } spin_unlock(&sk->sk_receive_queue.lock); return amount; } EXPORT_SYMBOL_GPL(unix_inq_len); long unix_outq_len(struct sock *sk) { return sk_wmem_alloc_get(sk); } EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { struct path path; struct file *f; int fd; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 
return -EPERM; if (!smp_load_acquire(&unix_sk(sk)->addr)) return -ENOENT; path = unix_sk(sk)->path; if (!path.dentry) return -ENOENT; path_get(&path); fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) goto out; f = dentry_open(&path, O_PATH, current_cred()); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); goto out; } fd_install(fd, f); out: path_put(&path); return fd; } static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; long amount = 0; int err; switch (cmd) { case SIOCOUTQ: amount = unix_outq_len(sk); err = put_user(amount, (int __user *)arg); break; case SIOCINQ: amount = unix_inq_len(sk); if (amount < 0) err = amount; else err = put_user(amount, (int __user *)arg); break; case SIOCUNIXFILE: err = unix_open_file(sk); break; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) case SIOCATMARK: { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int answ = 0; mutex_lock(&u->iolock); skb = skb_peek(&sk->sk_receive_queue); if (skb) { struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); struct sk_buff *next_skb; next_skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb == oob_skb || (!unix_skb_len(skb) && (!oob_skb || next_skb == oob_skb))) answ = 1; } mutex_unlock(&u->iolock); err = put_user(answ, (int __user *)arg); } break; #endif default: err = -ENOIOCTLCMD; break; } return err; } #ifdef CONFIG_COMPAT static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } #endif static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (READ_ONCE(unix_sk(sk)->oob_skb)) mask |= EPOLLPRI; #endif /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && state == TCP_CLOSE) mask |= EPOLLHUP; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ if (unix_writable(sk, state)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk, *other; unsigned int writable; unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? 
*/ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) mask |= EPOLLHUP; /* No write status requested, avoid expensive OUT tests. */ if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk, state); if (writable) { unix_state_lock(sk); other = unix_peer(sk); if (other && unix_peer(other) != sk && unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; unix_state_unlock(sk); } if (writable) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } #ifdef CONFIG_PROC_FS #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { unsigned long offset = get_offset(*pos); unsigned long bucket = get_bucket(*pos); unsigned long count = 0; struct sock *sk; for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); sk; sk = sk_next(sk)) { if (++count == offset) break; } return sk; } static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) { unsigned long bucket = get_bucket(*pos); struct net *net = seq_file_net(seq); struct sock *sk; while (bucket < UNIX_HASH_SIZE) { spin_lock(&net->unx.table.locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk; spin_unlock(&net->unx.table.locks[bucket]); *pos = set_bucket_offset(++bucket, 1); } return NULL; } static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, loff_t *pos) { unsigned long bucket = get_bucket(*pos); sk = sk_next(sk); if (sk) return sk; spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); *pos = set_bucket_offset(++bucket, 1); return unix_get_first(seq, pos); } static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { if (!*pos) return SEQ_START_TOKEN; return unix_get_first(seq, pos); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; if (v == SEQ_START_TOKEN) return unix_get_first(seq, pos); return unix_get_next(seq, v, pos); } static void unix_seq_stop(struct seq_file *seq, void *v) { struct sock *sk = v; if (sk) spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); } static int unix_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Num RefCount Protocol Flags Type St " "Inode Path\n"); else { struct sock *s = v; struct unix_sock *u = unix_sk(s); unix_state_lock(s); seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", s, refcount_read(&s->sk_refcnt), 0, s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, s->sk_type, s->sk_socket ? (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); if (u->addr) { // under a hash table lock here int i, len; seq_putc(seq, ' '); i = 0; len = u->addr->len - offsetof(struct sockaddr_un, sun_path); if (u->addr->name->sun_path[0]) { len--; } else { seq_putc(seq, '@'); i++; } for ( ; i < len; i++) seq_putc(seq, u->addr->name->sun_path[i] ?: '@'); } unix_state_unlock(s); seq_putc(seq, '\n'); } return 0; } static const struct seq_operations unix_seq_ops = { .start = unix_seq_start, .next = unix_seq_next, .stop = unix_seq_stop, .show = unix_seq_show, }; #ifdef CONFIG_BPF_SYSCALL struct bpf_unix_iter_state { struct seq_net_private p; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; struct sock **batch; bool st_bucket_done; }; struct bpf_iter__unix { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct unix_sock *, unix_sk); uid_t uid __aligned(8); }; static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) { struct bpf_iter__unix ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.unix_sk = unix_sk; ctx.uid = uid; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) { struct bpf_unix_iter_state *iter = seq->private; unsigned int expected = 1; struct sock *sk; sock_hold(start_sk); iter->batch[iter->end_sk++] = start_sk; for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++] = sk; } expected++; } spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); return expected; } static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) { while (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++]); } static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, unsigned int new_batch_sz) { struct sock **new_batch; new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, GFP_USER | __GFP_NOWARN); if (!new_batch) return -ENOMEM; bpf_iter_unix_put_batch(iter); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; return 0; } static struct sock *bpf_iter_unix_batch(struct seq_file *seq, loff_t *pos) { struct bpf_unix_iter_state *iter = seq->private; unsigned int expected; bool resized = false; struct sock *sk; if (iter->st_bucket_done) *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); again: /* Get a new batch */ iter->cur_sk = 0; iter->end_sk = 0; sk = unix_get_first(seq, pos); if (!sk) return NULL; /* Done */ expected = bpf_iter_unix_hold_batch(seq, sk); if (iter->end_sk == expected) { iter->st_bucket_done = true; return sk; } if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { resized = true; goto again; } return sk; } static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos) { if (!*pos) return SEQ_START_TOKEN; /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped. */ return bpf_iter_unix_batch(seq, pos); } static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_unix_iter_state *iter = seq->private; struct sock *sk; /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so advance to the next sk in * the batch. 
*/ if (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++]); ++*pos; if (iter->cur_sk < iter->end_sk) sk = iter->batch[iter->cur_sk]; else sk = bpf_iter_unix_batch(seq, pos); return sk; } static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; bool slow; int ret; if (v == SEQ_START_TOKEN) return 0; slow = lock_sock_fast(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; goto unlock; } uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = unix_prog_seq_show(prog, &meta, v, uid); unlock: unlock_sock_fast(sk, slow); return ret; } static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) { struct bpf_unix_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)unix_prog_seq_show(prog, &meta, v, 0); } if (iter->cur_sk < iter->end_sk) bpf_iter_unix_put_batch(iter); } static const struct seq_operations bpf_iter_unix_seq_ops = { .start = bpf_iter_unix_seq_start, .next = bpf_iter_unix_seq_next, .stop = bpf_iter_unix_seq_stop, .show = bpf_iter_unix_seq_show, }; #endif #endif static const struct net_proto_family unix_family_ops = { .family = PF_UNIX, .create = unix_create, .owner = THIS_MODULE, }; static int __net_init unix_net_init(struct net *net) { int i; net->unx.sysctl_max_dgram_qlen = 10; if (unix_sysctl_register(net)) goto out; #ifdef CONFIG_PROC_FS if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, sizeof(struct seq_net_private))) goto err_sysctl; #endif net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, sizeof(spinlock_t), GFP_KERNEL); if (!net->unx.table.locks) goto err_proc; net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!net->unx.table.buckets) goto free_locks; for (i = 0; i < UNIX_HASH_SIZE; i++) { spin_lock_init(&net->unx.table.locks[i]); lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); INIT_HLIST_HEAD(&net->unx.table.buckets[i]); } return 0; free_locks: kvfree(net->unx.table.locks); err_proc: #ifdef CONFIG_PROC_FS remove_proc_entry("unix", net->proc_net); err_sysctl: #endif unix_sysctl_unregister(net); out: return -ENOMEM; } static void __net_exit unix_net_exit(struct net *net) { kvfree(net->unx.table.buckets); kvfree(net->unx.table.locks); unix_sysctl_unregister(net); remove_proc_entry("unix", net->proc_net); } static struct pernet_operations unix_net_ops = { .init = unix_net_init, .exit = unix_net_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) #define INIT_BATCH_SZ 16 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_unix_iter_state *iter = priv_data; int err; err = bpf_iter_init_seq_net(priv_data, aux); if (err) return err; err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); if (err) { bpf_iter_fini_seq_net(priv_data); return err; } return 0; } static void bpf_iter_fini_unix(void *priv_data) { struct bpf_unix_iter_state *iter = priv_data; bpf_iter_fini_seq_net(priv_data); kvfree(iter->batch); } static const struct bpf_iter_seq_info unix_seq_info = { .seq_ops = &bpf_iter_unix_seq_ops, .init_seq_private = bpf_iter_init_unix, .fini_seq_private = bpf_iter_fini_unix, .seq_priv_size = sizeof(struct bpf_unix_iter_state), }; static const struct bpf_func_proto 
* bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sk_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sk_getsockopt_proto; default: return NULL; } } static struct bpf_iter_reg unix_reg_info = { .target = "unix", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__unix, unix_sk), PTR_TO_BTF_ID_OR_NULL }, }, .get_func_proto = bpf_iter_unix_get_func_proto, .seq_info = &unix_seq_info, }; static void __init bpf_iter_register(void) { unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; if (bpf_iter_reg_target(&unix_reg_info)) pr_warn("Warning: could not register bpf iterator unix\n"); } #endif static int __init af_unix_init(void) { int i, rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { spin_lock_init(&bsd_socket_locks[i]); INIT_HLIST_HEAD(&bsd_socket_buckets[i]); } rc = proto_register(&unix_dgram_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; } rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); proto_unregister(&unix_dgram_proto); goto out; } sock_register(&unix_family_ops); register_pernet_subsys(&unix_net_ops); unix_bpf_build_proto(); #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif out: return rc; } /* Later than subsys_initcall() because we depend on stuff initialised there */ fs_initcall(af_unix_init);
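/*
 * Illustrative userspace sketch, not part of the kernel source above: it
 * exercises the abstract-namespace bind path (unix_bind_abstract()) and the
 * autobind path (unix_autobind()) of the AF_UNIX datagram code. The server
 * binds an abstract name (sun_path[0] == '\0'); the client never calls
 * bind(), but because it enables SO_PASSCRED, the first sendto() autobinds
 * it to a kernel-chosen five-hex-digit abstract name, which getsockname()
 * then reports. The name "af_unix-demo" is made up for this example.
 */
#define _GNU_SOURCE
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>

int main(void)
{
	static const char name[] = "af_unix-demo";
	struct sockaddr_un srv = { .sun_family = AF_UNIX };
	struct sockaddr_un self;
	socklen_t slen = sizeof(self);
	socklen_t alen = offsetof(struct sockaddr_un, sun_path) + 1 + sizeof(name) - 1;
	int on = 1;
	char buf[32];
	int s, c;

	/* Abstract address: leading NUL byte, then the name. */
	memcpy(srv.sun_path + 1, name, sizeof(name) - 1);

	s = socket(AF_UNIX, SOCK_DGRAM, 0);
	c = socket(AF_UNIX, SOCK_DGRAM, 0);
	if (s < 0 || c < 0 || bind(s, (struct sockaddr *)&srv, alen) < 0)
		return 1;

	/* SOCK_PASSCRED on an unbound sender makes unix_dgram_sendmsg() autobind it. */
	setsockopt(c, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));

	if (sendto(c, "ping", 4, 0, (struct sockaddr *)&srv, alen) != 4)
		return 1;

	if (getsockname(c, (struct sockaddr *)&self, &slen) == 0 &&
	    slen > offsetof(struct sockaddr_un, sun_path) + 1)
		printf("client autobound to abstract name \"@%.*s\"\n",
		       (int)(slen - offsetof(struct sockaddr_un, sun_path) - 1),
		       self.sun_path + 1);

	recv(s, buf, sizeof(buf), 0);	/* server sees the 4-byte datagram */
	close(c);
	close(s);
	return 0;
}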
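/*
 * Illustrative userspace sketch, not part of the kernel source above: it
 * pokes the out-of-band paths (queue_oob(), unix_stream_recv_urg() and the
 * SIOCATMARK ioctl) on a SOCK_STREAM socketpair. It assumes a kernel built
 * with CONFIG_AF_UNIX_OOB; on kernels without it, send(..., MSG_OOB) fails
 * with EOPNOTSUPP.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

int main(void)
{
	int fds[2], atmark = 0;
	char buf[16], oob;
	ssize_t n;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0)
		return 1;

	send(fds[0], "abc", 3, 0);		/* ordinary bytes */
	if (send(fds[0], "!", 1, MSG_OOB) < 0)	/* queue_oob() on the peer */
		return 1;

	/* A plain read stops short of the OOB byte (see manage_oob()). */
	n = recv(fds[1], buf, sizeof(buf), 0);
	printf("in-band: %.*s\n", (int)n, buf);

	/* SIOCATMARK reports whether the read pointer is at the OOB mark. */
	ioctl(fds[1], SIOCATMARK, &atmark);
	printf("at mark: %d\n", atmark);

	if (recv(fds[1], &oob, 1, MSG_OOB) == 1)	/* unix_stream_recv_urg() */
		printf("out-of-band byte: %c\n", oob);

	close(fds[0]);
	close(fds[1]);
	return 0;
}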
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * acpi_bus.c - ACPI Bus Driver ($Revision: 80 $)
 *
 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
 */

#define pr_fmt(fmt) "ACPI: " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/pm.h>
#include <linux/device.h>
#include <linux/proc_fs.h>
#include <linux/acpi.h>
#include <linux/slab.h>
#include <linux/regulator/machine.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#ifdef CONFIG_X86
#include <asm/mpspec.h>
#include <linux/dmi.h>
#endif
#include <linux/acpi_viot.h>
#include <linux/pci.h>
#include <acpi/apei.h>
#include <linux/suspend.h>
#include <linux/prmt.h>
#include "internal.h" struct acpi_device *acpi_root; struct proc_dir_entry *acpi_root_dir; EXPORT_SYMBOL(acpi_root_dir); #ifdef CONFIG_X86 #ifdef CONFIG_ACPI_CUSTOM_DSDT static inline int set_copy_dsdt(const struct dmi_system_id *id) { return 0; } #else static int set_copy_dsdt(const struct dmi_system_id *id) { pr_notice("%s detected - force copy of DSDT to local memory\n", id->ident); acpi_gbl_copy_dsdt_locally = 1; return 0; } #endif static const struct dmi_system_id dsdt_dmi_table[] __initconst = { /* * Invoke DSDT corruption work-around on all Toshiba Satellite. * https://bugzilla.kernel.org/show_bug.cgi?id=14679 */ { .callback = set_copy_dsdt, .ident = "TOSHIBA Satellite", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), DMI_MATCH(DMI_PRODUCT_NAME, "Satellite"), }, }, {} }; #endif /* -------------------------------------------------------------------------- Device Management -------------------------------------------------------------------------- */ acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta) { acpi_status status; status = acpi_evaluate_integer(handle, "_STA", NULL, sta); if (ACPI_SUCCESS(status)) return AE_OK; if (status == AE_NOT_FOUND) { *sta = ACPI_STA_DEVICE_PRESENT | ACPI_STA_DEVICE_ENABLED | ACPI_STA_DEVICE_UI | ACPI_STA_DEVICE_FUNCTIONING; return AE_OK; } return status; } EXPORT_SYMBOL_GPL(acpi_bus_get_status_handle); int acpi_bus_get_status(struct acpi_device *device) { acpi_status status; unsigned long long sta; if (acpi_device_override_status(device, &sta)) { acpi_set_device_status(device, sta); return 0; } /* Battery devices must have their deps met before calling _STA */ if (acpi_device_is_battery(device) && device->dep_unmet) { acpi_set_device_status(device, 0); return 0; } status = acpi_bus_get_status_handle(device->handle, &sta); if (ACPI_FAILURE(status)) return -ENODEV; if (!device->status.present && device->status.enabled) { pr_info(FW_BUG "Device [%s] status [%08x]: not present and enabled\n", device->pnp.bus_id, (u32)sta); device->status.enabled = 0; /* * The status is clearly invalid, so clear the functional bit as * well to avoid attempting to use the device. 
*/ device->status.functional = 0; } acpi_set_device_status(device, sta); if (device->status.functional && !device->status.present) { pr_debug("Device [%s] status [%08x]: functional but not present\n", device->pnp.bus_id, (u32)sta); } pr_debug("Device [%s] status [%08x]\n", device->pnp.bus_id, (u32)sta); return 0; } EXPORT_SYMBOL(acpi_bus_get_status); void acpi_bus_private_data_handler(acpi_handle handle, void *context) { return; } EXPORT_SYMBOL(acpi_bus_private_data_handler); int acpi_bus_attach_private_data(acpi_handle handle, void *data) { acpi_status status; status = acpi_attach_data(handle, acpi_bus_private_data_handler, data); if (ACPI_FAILURE(status)) { acpi_handle_debug(handle, "Error attaching device data\n"); return -ENODEV; } return 0; } EXPORT_SYMBOL_GPL(acpi_bus_attach_private_data); int acpi_bus_get_private_data(acpi_handle handle, void **data) { acpi_status status; if (!data) return -EINVAL; status = acpi_get_data(handle, acpi_bus_private_data_handler, data); if (ACPI_FAILURE(status)) { acpi_handle_debug(handle, "No context for object\n"); return -ENODEV; } return 0; } EXPORT_SYMBOL_GPL(acpi_bus_get_private_data); void acpi_bus_detach_private_data(acpi_handle handle) { acpi_detach_data(handle, acpi_bus_private_data_handler); } EXPORT_SYMBOL_GPL(acpi_bus_detach_private_data); static void acpi_print_osc_error(acpi_handle handle, struct acpi_osc_context *context, char *error) { int i; acpi_handle_debug(handle, "(%s): %s\n", context->uuid_str, error); pr_debug("_OSC request data:"); for (i = 0; i < context->cap.length; i += sizeof(u32)) pr_debug(" %x", *((u32 *)(context->cap.pointer + i))); pr_debug("\n"); } acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context) { acpi_status status; struct acpi_object_list input; union acpi_object in_params[4]; union acpi_object *out_obj; guid_t guid; u32 errors; struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; if (!context) return AE_ERROR; if (guid_parse(context->uuid_str, &guid)) return AE_ERROR; context->ret.length = ACPI_ALLOCATE_BUFFER; context->ret.pointer = NULL; /* Setting up input parameters */ input.count = 4; input.pointer = in_params; in_params[0].type = ACPI_TYPE_BUFFER; in_params[0].buffer.length = 16; in_params[0].buffer.pointer = (u8 *)&guid; in_params[1].type = ACPI_TYPE_INTEGER; in_params[1].integer.value = context->rev; in_params[2].type = ACPI_TYPE_INTEGER; in_params[2].integer.value = context->cap.length/sizeof(u32); in_params[3].type = ACPI_TYPE_BUFFER; in_params[3].buffer.length = context->cap.length; in_params[3].buffer.pointer = context->cap.pointer; status = acpi_evaluate_object(handle, "_OSC", &input, &output); if (ACPI_FAILURE(status)) return status; if (!output.length) return AE_NULL_OBJECT; out_obj = output.pointer; if (out_obj->type != ACPI_TYPE_BUFFER || out_obj->buffer.length != context->cap.length) { acpi_print_osc_error(handle, context, "_OSC evaluation returned wrong type"); status = AE_TYPE; goto out_kfree; } /* Need to ignore the bit0 in result code */ errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); if (errors) { if (errors & OSC_REQUEST_ERROR) acpi_print_osc_error(handle, context, "_OSC request failed"); if (errors & OSC_INVALID_UUID_ERROR) acpi_print_osc_error(handle, context, "_OSC invalid UUID"); if (errors & OSC_INVALID_REVISION_ERROR) acpi_print_osc_error(handle, context, "_OSC invalid revision"); if (errors & OSC_CAPABILITIES_MASK_ERROR) { if (((u32 *)context->cap.pointer)[OSC_QUERY_DWORD] & OSC_QUERY_ENABLE) goto out_success; status = AE_SUPPORT; goto 
out_kfree; } status = AE_ERROR; goto out_kfree; } out_success: context->ret.length = out_obj->buffer.length; context->ret.pointer = kmemdup(out_obj->buffer.pointer, context->ret.length, GFP_KERNEL); if (!context->ret.pointer) { status = AE_NO_MEMORY; goto out_kfree; } status = AE_OK; out_kfree: kfree(output.pointer); return status; } EXPORT_SYMBOL(acpi_run_osc); bool osc_sb_apei_support_acked; /* * ACPI 6.0 Section 8.4.4.2 Idle State Coordination * OSPM supports platform coordinated low power idle(LPI) states */ bool osc_pc_lpi_support_confirmed; EXPORT_SYMBOL_GPL(osc_pc_lpi_support_confirmed); /* * ACPI 6.2 Section 6.2.11.2 'Platform-Wide OSPM Capabilities': * Starting with ACPI Specification 6.2, all _CPC registers can be in * PCC, System Memory, System IO, or Functional Fixed Hardware address * spaces. OSPM support for this more flexible register space scheme is * indicated by the “Flexible Address Space for CPPC Registers” _OSC bit. * * Otherwise (cf ACPI 6.1, s8.4.7.1.1.X), _CPC registers must be in: * - PCC or Functional Fixed Hardware address space if defined * - SystemMemory address space (NULL register) if not defined */ bool osc_cpc_flexible_adr_space_confirmed; EXPORT_SYMBOL_GPL(osc_cpc_flexible_adr_space_confirmed); /* * ACPI 6.4 Operating System Capabilities for USB. */ bool osc_sb_native_usb4_support_confirmed; EXPORT_SYMBOL_GPL(osc_sb_native_usb4_support_confirmed); bool osc_sb_cppc2_support_acked; static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48"; static void acpi_bus_osc_negotiate_platform_control(void) { u32 capbuf[2], *capbuf_ret; struct acpi_osc_context context = { .uuid_str = sb_uuid_str, .rev = 1, .cap.length = 8, .cap.pointer = capbuf, }; acpi_handle handle; capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = OSC_SB_PR3_SUPPORT; /* _PR3 is in use */ if (IS_ENABLED(CONFIG_ACPI_PROCESSOR_AGGREGATOR)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PAD_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_PROCESSOR)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PPC_OST_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_THERMAL)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_BATTERY)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_BATTERY_CHARGE_LIMITING_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_OVER_16_PSTATES_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GED_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_IRQ_RESOURCE_SOURCE_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_PRMT)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_FFH)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_FFH_OPR_SUPPORT; #ifdef CONFIG_ARM64 capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; #endif #ifdef CONFIG_X86 capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; #endif #ifdef CONFIG_ACPI_CPPC_LIB capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT; #endif capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_FLEXIBLE_ADR_SPACE; if (IS_ENABLED(CONFIG_SCHED_MC_PRIO)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_DIVERSE_HIGH_SUPPORT; if (IS_ENABLED(CONFIG_USB4)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_NATIVE_USB4_SUPPORT; if (!ghes_disable) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_APEI_SUPPORT; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) return; if (ACPI_FAILURE(acpi_run_osc(handle, &context))) return; capbuf_ret = context.ret.pointer; if (context.ret.length <= OSC_SUPPORT_DWORD) { kfree(context.ret.pointer); 
return; } /* * Now run _OSC again with query flag clear and with the caps * supported by both the OS and the platform. */ capbuf[OSC_QUERY_DWORD] = 0; capbuf[OSC_SUPPORT_DWORD] = capbuf_ret[OSC_SUPPORT_DWORD]; kfree(context.ret.pointer); if (ACPI_FAILURE(acpi_run_osc(handle, &context))) return; capbuf_ret = context.ret.pointer; if (context.ret.length > OSC_SUPPORT_DWORD) { #ifdef CONFIG_ACPI_CPPC_LIB osc_sb_cppc2_support_acked = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPCV2_SUPPORT; #endif osc_sb_apei_support_acked = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_APEI_SUPPORT; osc_pc_lpi_support_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_PCLPI_SUPPORT; osc_sb_native_usb4_support_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_NATIVE_USB4_SUPPORT; osc_cpc_flexible_adr_space_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPC_FLEXIBLE_ADR_SPACE; } kfree(context.ret.pointer); } /* * Native control of USB4 capabilities. If any of the tunneling bits is * set it means OS is in control and we use software based connection * manager. */ u32 osc_sb_native_usb4_control; EXPORT_SYMBOL_GPL(osc_sb_native_usb4_control); static void acpi_bus_decode_usb_osc(const char *msg, u32 bits) { pr_info("%s USB3%c DisplayPort%c PCIe%c XDomain%c\n", msg, (bits & OSC_USB_USB3_TUNNELING) ? '+' : '-', (bits & OSC_USB_DP_TUNNELING) ? '+' : '-', (bits & OSC_USB_PCIE_TUNNELING) ? '+' : '-', (bits & OSC_USB_XDOMAIN) ? '+' : '-'); } static u8 sb_usb_uuid_str[] = "23A0D13A-26AB-486C-9C5F-0FFA525A575A"; static void acpi_bus_osc_negotiate_usb_control(void) { u32 capbuf[3], *capbuf_ret; struct acpi_osc_context context = { .uuid_str = sb_usb_uuid_str, .rev = 1, .cap.length = sizeof(capbuf), .cap.pointer = capbuf, }; acpi_handle handle; acpi_status status; u32 control; if (!osc_sb_native_usb4_support_confirmed) return; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) return; control = OSC_USB_USB3_TUNNELING | OSC_USB_DP_TUNNELING | OSC_USB_PCIE_TUNNELING | OSC_USB_XDOMAIN; /* * Run _OSC first with query bit set, trying to get control over * all tunneling. The platform can then clear out bits in the * control dword that it does not want to grant to the OS. */ capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = 0; capbuf[OSC_CONTROL_DWORD] = control; status = acpi_run_osc(handle, &context); if (ACPI_FAILURE(status)) return; if (context.ret.length != sizeof(capbuf)) { pr_info("USB4 _OSC: returned invalid length buffer\n"); goto out_free; } /* * Run _OSC again now with query bit clear and the control dword * matching what the platform granted (which may not have all * the control bits set). */ capbuf_ret = context.ret.pointer; capbuf[OSC_QUERY_DWORD] = 0; capbuf[OSC_CONTROL_DWORD] = capbuf_ret[OSC_CONTROL_DWORD]; kfree(context.ret.pointer); status = acpi_run_osc(handle, &context); if (ACPI_FAILURE(status)) return; if (context.ret.length != sizeof(capbuf)) { pr_info("USB4 _OSC: returned invalid length buffer\n"); goto out_free; } osc_sb_native_usb4_control = control & acpi_osc_ctx_get_pci_control(&context); acpi_bus_decode_usb_osc("USB4 _OSC: OS supports", control); acpi_bus_decode_usb_osc("USB4 _OSC: OS controls", osc_sb_native_usb4_control); out_free: kfree(context.ret.pointer); } /* -------------------------------------------------------------------------- Notification Handling -------------------------------------------------------------------------- */ /** * acpi_bus_notify - Global system-level (0x00-0x7F) notifications handler * @handle: Target ACPI object. * @type: Notification type. 
* @data: Ignored. * * This only handles notifications related to device hotplug. */ static void acpi_bus_notify(acpi_handle handle, u32 type, void *data) { struct acpi_device *adev; switch (type) { case ACPI_NOTIFY_BUS_CHECK: acpi_handle_debug(handle, "ACPI_NOTIFY_BUS_CHECK event\n"); break; case ACPI_NOTIFY_DEVICE_CHECK: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK event\n"); break; case ACPI_NOTIFY_DEVICE_WAKE: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_WAKE event\n"); return; case ACPI_NOTIFY_EJECT_REQUEST: acpi_handle_debug(handle, "ACPI_NOTIFY_EJECT_REQUEST event\n"); break; case ACPI_NOTIFY_DEVICE_CHECK_LIGHT: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK_LIGHT event\n"); /* TBD: Exactly what does 'light' mean? */ return; case ACPI_NOTIFY_FREQUENCY_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a frequency mismatch\n"); return; case ACPI_NOTIFY_BUS_MODE_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a bus mode mismatch\n"); return; case ACPI_NOTIFY_POWER_FAULT: acpi_handle_err(handle, "Device has suffered a power fault\n"); return; default: acpi_handle_debug(handle, "Unknown event type 0x%x\n", type); return; } adev = acpi_get_acpi_dev(handle); if (adev && ACPI_SUCCESS(acpi_hotplug_schedule(adev, type))) return; acpi_put_acpi_dev(adev); acpi_evaluate_ost(handle, type, ACPI_OST_SC_NON_SPECIFIC_FAILURE, NULL); } static void acpi_notify_device(acpi_handle handle, u32 event, void *data) { struct acpi_device *device = data; struct acpi_driver *acpi_drv = to_acpi_driver(device->dev.driver); acpi_drv->ops.notify(device, event); } static int acpi_device_install_notify_handler(struct acpi_device *device, struct acpi_driver *acpi_drv) { u32 type = acpi_drv->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS ? ACPI_ALL_NOTIFY : ACPI_DEVICE_NOTIFY; acpi_status status; status = acpi_install_notify_handler(device->handle, type, acpi_notify_device, device); if (ACPI_FAILURE(status)) return -EINVAL; return 0; } static void acpi_device_remove_notify_handler(struct acpi_device *device, struct acpi_driver *acpi_drv) { u32 type = acpi_drv->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS ? 
ACPI_ALL_NOTIFY : ACPI_DEVICE_NOTIFY; acpi_remove_notify_handler(device->handle, type, acpi_notify_device); acpi_os_wait_events_complete(); } int acpi_dev_install_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler, void *context) { acpi_status status; status = acpi_install_notify_handler(adev->handle, handler_type, handler, context); if (ACPI_FAILURE(status)) return -ENODEV; return 0; } EXPORT_SYMBOL_GPL(acpi_dev_install_notify_handler); void acpi_dev_remove_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler) { acpi_remove_notify_handler(adev->handle, handler_type, handler); acpi_os_wait_events_complete(); } EXPORT_SYMBOL_GPL(acpi_dev_remove_notify_handler); /* Handle events targeting \_SB device (at present only graceful shutdown) */ #define ACPI_SB_NOTIFY_SHUTDOWN_REQUEST 0x81 #define ACPI_SB_INDICATE_INTERVAL 10000 static void sb_notify_work(struct work_struct *dummy) { acpi_handle sb_handle; orderly_poweroff(true); /* * After initiating graceful shutdown, the ACPI spec requires OSPM * to evaluate _OST method once every 10seconds to indicate that * the shutdown is in progress */ acpi_get_handle(NULL, "\\_SB", &sb_handle); while (1) { pr_info("Graceful shutdown in progress.\n"); acpi_evaluate_ost(sb_handle, ACPI_OST_EC_OSPM_SHUTDOWN, ACPI_OST_SC_OS_SHUTDOWN_IN_PROGRESS, NULL); msleep(ACPI_SB_INDICATE_INTERVAL); } } static void acpi_sb_notify(acpi_handle handle, u32 event, void *data) { static DECLARE_WORK(acpi_sb_work, sb_notify_work); if (event == ACPI_SB_NOTIFY_SHUTDOWN_REQUEST) { if (!work_busy(&acpi_sb_work)) schedule_work(&acpi_sb_work); } else { pr_warn("event %x is not supported by \\_SB device\n", event); } } static int __init acpi_setup_sb_notify_handler(void) { acpi_handle sb_handle; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &sb_handle))) return -ENXIO; if (ACPI_FAILURE(acpi_install_notify_handler(sb_handle, ACPI_DEVICE_NOTIFY, acpi_sb_notify, NULL))) return -EINVAL; return 0; } /* -------------------------------------------------------------------------- Device Matching -------------------------------------------------------------------------- */ /** * acpi_get_first_physical_node - Get first physical node of an ACPI device * @adev: ACPI device in question * * Return: First physical node of ACPI device @adev */ struct device *acpi_get_first_physical_node(struct acpi_device *adev) { struct mutex *physical_node_lock = &adev->physical_node_lock; struct device *phys_dev; mutex_lock(physical_node_lock); if (list_empty(&adev->physical_node_list)) { phys_dev = NULL; } else { const struct acpi_device_physical_node *node; node = list_first_entry(&adev->physical_node_list, struct acpi_device_physical_node, node); phys_dev = node->dev; } mutex_unlock(physical_node_lock); return phys_dev; } EXPORT_SYMBOL_GPL(acpi_get_first_physical_node); static struct acpi_device *acpi_primary_dev_companion(struct acpi_device *adev, const struct device *dev) { const struct device *phys_dev = acpi_get_first_physical_node(adev); return phys_dev && phys_dev == dev ? adev : NULL; } /** * acpi_device_is_first_physical_node - Is given dev first physical node * @adev: ACPI companion device * @dev: Physical device to check * * Function checks if given @dev is the first physical devices attached to * the ACPI companion device. This distinction is needed in some cases * where the same companion device is shared between many physical devices. * * Note that the caller have to provide valid @adev pointer. 
*/ bool acpi_device_is_first_physical_node(struct acpi_device *adev, const struct device *dev) { return !!acpi_primary_dev_companion(adev, dev); } /* * acpi_companion_match() - Can we match via ACPI companion device * @dev: Device in question * * Check if the given device has an ACPI companion and if that companion has * a valid list of PNP IDs, and if the device is the first (primary) physical * device associated with it. Return the companion pointer if that's the case * or NULL otherwise. * * If multiple physical devices are attached to a single ACPI companion, we need * to be careful. The usage scenario for this kind of relationship is that all * of the physical devices in question use resources provided by the ACPI * companion. A typical case is an MFD device where all the sub-devices share * the parent's ACPI companion. In such cases we can only allow the primary * (first) physical device to be matched with the help of the companion's PNP * IDs. * * Additional physical devices sharing the ACPI companion can still use * resources available from it but they will be matched normally using functions * provided by their bus types (and analogously for their modalias). */ const struct acpi_device *acpi_companion_match(const struct device *dev) { struct acpi_device *adev; adev = ACPI_COMPANION(dev); if (!adev) return NULL; if (list_empty(&adev->pnp.ids)) return NULL; return acpi_primary_dev_companion(adev, dev); } /** * acpi_of_match_device - Match device object using the "compatible" property. * @adev: ACPI device object to match. * @of_match_table: List of device IDs to match against. * @of_id: OF ID if matched * * If @dev has an ACPI companion which has ACPI_DT_NAMESPACE_HID in its list of * identifiers and a _DSD object with the "compatible" property, use that * property to match against the given list of identifiers. */ static bool acpi_of_match_device(const struct acpi_device *adev, const struct of_device_id *of_match_table, const struct of_device_id **of_id) { const union acpi_object *of_compatible, *obj; int i, nval; if (!adev) return false; of_compatible = adev->data.of_compatible; if (!of_match_table || !of_compatible) return false; if (of_compatible->type == ACPI_TYPE_PACKAGE) { nval = of_compatible->package.count; obj = of_compatible->package.elements; } else { /* Must be ACPI_TYPE_STRING. */ nval = 1; obj = of_compatible; } /* Now we can look for the driver DT compatible strings */ for (i = 0; i < nval; i++, obj++) { const struct of_device_id *id; for (id = of_match_table; id->compatible[0]; id++) if (!strcasecmp(obj->string.pointer, id->compatible)) { if (of_id) *of_id = id; return true; } } return false; } static bool acpi_of_modalias(struct acpi_device *adev, char *modalias, size_t len) { const union acpi_object *of_compatible; const union acpi_object *obj; const char *str, *chr; of_compatible = adev->data.of_compatible; if (!of_compatible) return false; if (of_compatible->type == ACPI_TYPE_PACKAGE) obj = of_compatible->package.elements; else /* Must be ACPI_TYPE_STRING. */ obj = of_compatible; str = obj->string.pointer; chr = strchr(str, ','); strscpy(modalias, chr ? 
chr + 1 : str, len); return true; } /** * acpi_set_modalias - Set modalias using "compatible" property or supplied ID * @adev: ACPI device object to match * @default_id: ID string to use as default if no compatible string found * @modalias: Pointer to buffer that modalias value will be copied into * @len: Length of modalias buffer * * This is a counterpart of of_alias_from_compatible() for struct acpi_device * objects. If there is a compatible string for @adev, it will be copied to * @modalias with the vendor prefix stripped; otherwise, @default_id will be * used. */ void acpi_set_modalias(struct acpi_device *adev, const char *default_id, char *modalias, size_t len) { if (!acpi_of_modalias(adev, modalias, len)) strscpy(modalias, default_id, len); } EXPORT_SYMBOL_GPL(acpi_set_modalias); static bool __acpi_match_device_cls(const struct acpi_device_id *id, struct acpi_hardware_id *hwid) { int i, msk, byte_shift; char buf[3]; if (!id->cls) return false; /* Apply class-code bitmask, before checking each class-code byte */ for (i = 1; i <= 3; i++) { byte_shift = 8 * (3 - i); msk = (id->cls_msk >> byte_shift) & 0xFF; if (!msk) continue; sprintf(buf, "%02x", (id->cls >> byte_shift) & msk); if (strncmp(buf, &hwid->id[(i - 1) * 2], 2)) return false; } return true; } static bool __acpi_match_device(const struct acpi_device *device, const struct acpi_device_id *acpi_ids, const struct of_device_id *of_ids, const struct acpi_device_id **acpi_id, const struct of_device_id **of_id) { const struct acpi_device_id *id; struct acpi_hardware_id *hwid; /* * If the device is not present, it is unnecessary to load device * driver for it. */ if (!device || !device->status.present) return false; list_for_each_entry(hwid, &device->pnp.ids, list) { /* First, check the ACPI/PNP IDs provided by the caller. */ if (acpi_ids) { for (id = acpi_ids; id->id[0] || id->cls; id++) { if (id->id[0] && !strcmp((char *)id->id, hwid->id)) goto out_acpi_match; if (id->cls && __acpi_match_device_cls(id, hwid)) goto out_acpi_match; } } /* * Next, check ACPI_DT_NAMESPACE_HID and try to match the * "compatible" property if found. */ if (!strcmp(ACPI_DT_NAMESPACE_HID, hwid->id)) return acpi_of_match_device(device, of_ids, of_id); } return false; out_acpi_match: if (acpi_id) *acpi_id = id; return true; } /** * acpi_match_acpi_device - Match an ACPI device against a given list of ACPI IDs * @ids: Array of struct acpi_device_id objects to match against. * @adev: The ACPI device pointer to match. * * Match the ACPI device @adev against a given list of ACPI IDs @ids. * * Return: * a pointer to the first matching ACPI ID on success or %NULL on failure. */ const struct acpi_device_id *acpi_match_acpi_device(const struct acpi_device_id *ids, const struct acpi_device *adev) { const struct acpi_device_id *id = NULL; __acpi_match_device(adev, ids, NULL, &id, NULL); return id; } EXPORT_SYMBOL_GPL(acpi_match_acpi_device); /** * acpi_match_device - Match a struct device against a given list of ACPI IDs * @ids: Array of struct acpi_device_id object to match against. * @dev: The device structure to match. * * Check if @dev has a valid ACPI handle and if there is a struct acpi_device * object for that handle and use that object to match against a given list of * device IDs. * * Return a pointer to the first matching ID on success or %NULL on failure. 
*/ const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev) { return acpi_match_acpi_device(ids, acpi_companion_match(dev)); } EXPORT_SYMBOL_GPL(acpi_match_device); static const void *acpi_of_device_get_match_data(const struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); const struct of_device_id *match = NULL; if (!acpi_of_match_device(adev, dev->driver->of_match_table, &match)) return NULL; return match->data; } const void *acpi_device_get_match_data(const struct device *dev) { const struct acpi_device_id *acpi_ids = dev->driver->acpi_match_table; const struct acpi_device_id *match; if (!acpi_ids) return acpi_of_device_get_match_data(dev); match = acpi_match_device(acpi_ids, dev); if (!match) return NULL; return (const void *)match->driver_data; } EXPORT_SYMBOL_GPL(acpi_device_get_match_data); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids) { return __acpi_match_device(device, ids, NULL, NULL, NULL) ? 0 : -ENOENT; } EXPORT_SYMBOL(acpi_match_device_ids); bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv) { const struct acpi_device_id *acpi_ids = drv->acpi_match_table; const struct of_device_id *of_ids = drv->of_match_table; if (!acpi_ids) return acpi_of_match_device(ACPI_COMPANION(dev), of_ids, NULL); return __acpi_match_device(acpi_companion_match(dev), acpi_ids, of_ids, NULL, NULL); } EXPORT_SYMBOL_GPL(acpi_driver_match_device); /* -------------------------------------------------------------------------- ACPI Driver Management -------------------------------------------------------------------------- */ /** * __acpi_bus_register_driver - register a driver with the ACPI bus * @driver: driver being registered * @owner: owning module/driver * * Registers a driver with the ACPI bus. Searches the namespace for all * devices that match the driver's criteria and binds. Returns zero for * success or a negative error status for failure. */ int __acpi_bus_register_driver(struct acpi_driver *driver, struct module *owner) { if (acpi_disabled) return -ENODEV; driver->drv.name = driver->name; driver->drv.bus = &acpi_bus_type; driver->drv.owner = owner; return driver_register(&driver->drv); } EXPORT_SYMBOL(__acpi_bus_register_driver); /** * acpi_bus_unregister_driver - unregisters a driver with the ACPI bus * @driver: driver to unregister * * Unregisters a driver with the ACPI bus. Searches the namespace for all * devices that match the driver's criteria and unbinds. 
*/ void acpi_bus_unregister_driver(struct acpi_driver *driver) { driver_unregister(&driver->drv); } EXPORT_SYMBOL(acpi_bus_unregister_driver); /* -------------------------------------------------------------------------- ACPI Bus operations -------------------------------------------------------------------------- */ static int acpi_bus_match(struct device *dev, const struct device_driver *drv) { struct acpi_device *acpi_dev = to_acpi_device(dev); const struct acpi_driver *acpi_drv = to_acpi_driver(drv); return acpi_dev->flags.match_driver && !acpi_match_device_ids(acpi_dev, acpi_drv->ids); } static int acpi_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { return __acpi_device_uevent_modalias(to_acpi_device(dev), env); } static int acpi_device_probe(struct device *dev) { struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_driver *acpi_drv = to_acpi_driver(dev->driver); int ret; if (acpi_dev->handler && !acpi_is_pnp_device(acpi_dev)) return -EINVAL; if (!acpi_drv->ops.add) return -ENOSYS; ret = acpi_drv->ops.add(acpi_dev); if (ret) { acpi_dev->driver_data = NULL; return ret; } pr_debug("Driver [%s] successfully bound to device [%s]\n", acpi_drv->name, acpi_dev->pnp.bus_id); if (acpi_drv->ops.notify) { ret = acpi_device_install_notify_handler(acpi_dev, acpi_drv); if (ret) { if (acpi_drv->ops.remove) acpi_drv->ops.remove(acpi_dev); acpi_dev->driver_data = NULL; return ret; } } pr_debug("Found driver [%s] for device [%s]\n", acpi_drv->name, acpi_dev->pnp.bus_id); get_device(dev); return 0; } static void acpi_device_remove(struct device *dev) { struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_driver *acpi_drv = to_acpi_driver(dev->driver); if (acpi_drv->ops.notify) acpi_device_remove_notify_handler(acpi_dev, acpi_drv); if (acpi_drv->ops.remove) acpi_drv->ops.remove(acpi_dev); acpi_dev->driver_data = NULL; put_device(dev); } const struct bus_type acpi_bus_type = { .name = "acpi", .match = acpi_bus_match, .probe = acpi_device_probe, .remove = acpi_device_remove, .uevent = acpi_device_uevent, }; int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data) { return bus_for_each_dev(&acpi_bus_type, NULL, data, fn); } EXPORT_SYMBOL_GPL(acpi_bus_for_each_dev); struct acpi_dev_walk_context { int (*fn)(struct acpi_device *, void *); void *data; }; static int acpi_dev_for_one_check(struct device *dev, void *context) { struct acpi_dev_walk_context *adwc = context; if (dev->bus != &acpi_bus_type) return 0; return adwc->fn(to_acpi_device(dev), adwc->data); } EXPORT_SYMBOL_GPL(acpi_dev_for_each_child); int acpi_dev_for_each_child(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data) { struct acpi_dev_walk_context adwc = { .fn = fn, .data = data, }; return device_for_each_child(&adev->dev, &adwc, acpi_dev_for_one_check); } int acpi_dev_for_each_child_reverse(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data) { struct acpi_dev_walk_context adwc = { .fn = fn, .data = data, }; return device_for_each_child_reverse(&adev->dev, &adwc, acpi_dev_for_one_check); } /* -------------------------------------------------------------------------- Initialization/Cleanup -------------------------------------------------------------------------- */ static int __init acpi_bus_init_irq(void) { acpi_status status; char *message = NULL; /* * Let the system know what interrupt model we are using by * evaluating the \_PIC object, if exists. 
*/ switch (acpi_irq_model) { case ACPI_IRQ_MODEL_PIC: message = "PIC"; break; case ACPI_IRQ_MODEL_IOAPIC: message = "IOAPIC"; break; case ACPI_IRQ_MODEL_IOSAPIC: message = "IOSAPIC"; break; case ACPI_IRQ_MODEL_GIC: message = "GIC"; break; case ACPI_IRQ_MODEL_PLATFORM: message = "platform specific model"; break; case ACPI_IRQ_MODEL_LPIC: message = "LPIC"; break; case ACPI_IRQ_MODEL_RINTC: message = "RINTC"; break; default: pr_info("Unknown interrupt routing model\n"); return -ENODEV; } pr_info("Using %s for interrupt routing\n", message); status = acpi_execute_simple_method(NULL, "\\_PIC", acpi_irq_model); if (ACPI_FAILURE(status) && (status != AE_NOT_FOUND)) { pr_info("_PIC evaluation failed: %s\n", acpi_format_exception(status)); return -ENODEV; } return 0; } /** * acpi_early_init - Initialize ACPICA and populate the ACPI namespace. * * The ACPI tables are accessible after this, but the handling of events has not * been initialized and the global lock is not available yet, so AML should not * be executed at this point. * * Doing this before switching the EFI runtime services to virtual mode allows * the EfiBootServices memory to be freed slightly earlier on boot. */ void __init acpi_early_init(void) { acpi_status status; if (acpi_disabled) return; pr_info("Core revision %08x\n", ACPI_CA_VERSION); /* enable workarounds, unless strict ACPI spec. compliance */ if (!acpi_strict) acpi_gbl_enable_interpreter_slack = TRUE; acpi_permanent_mmap = true; #ifdef CONFIG_X86 /* * If the machine falls into the DMI check table, * DSDT will be copied to memory. * Note that calling dmi_check_system() here on other architectures * would not be OK because only x86 initializes dmi early enough. * Thankfully only x86 systems need such quirks for now. */ dmi_check_system(dsdt_dmi_table); #endif status = acpi_reallocate_root_table(); if (ACPI_FAILURE(status)) { pr_err("Unable to reallocate ACPI tables\n"); goto error0; } status = acpi_initialize_subsystem(); if (ACPI_FAILURE(status)) { pr_err("Unable to initialize the ACPI Interpreter\n"); goto error0; } #ifdef CONFIG_X86 if (!acpi_ioapic) { /* compatible (0) means level (3) */ if (!(acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)) { acpi_sci_flags &= ~ACPI_MADT_TRIGGER_MASK; acpi_sci_flags |= ACPI_MADT_TRIGGER_LEVEL; } /* Set PIC-mode SCI trigger type */ acpi_pic_sci_set_trigger(acpi_gbl_FADT.sci_interrupt, (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2); } else { /* * now that acpi_gbl_FADT is initialized, * update it with result from INT_SRC_OVR parsing */ acpi_gbl_FADT.sci_interrupt = acpi_sci_override_gsi; } #endif return; error0: disable_acpi(); } /** * acpi_subsystem_init - Finalize the early initialization of ACPI. * * Switch over the platform to the ACPI mode (if possible). * * Doing this too early is generally unsafe, but at the same time it needs to be * done before all things that really depend on ACPI. The right spot appears to * be before finalizing the EFI initialization. */ void __init acpi_subsystem_init(void) { acpi_status status; if (acpi_disabled) return; status = acpi_enable_subsystem(~ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { pr_err("Unable to enable ACPI\n"); disable_acpi(); } else { /* * If the system is using ACPI then we can be reasonably * confident that any regulators are managed by the firmware * so tell the regulator core it has everything it needs to * know. 
*/ regulator_has_full_constraints(); } } static acpi_status acpi_bus_table_handler(u32 event, void *table, void *context) { if (event == ACPI_TABLE_EVENT_LOAD) acpi_scan_table_notify(); return acpi_sysfs_table_handler(event, table, context); } static int __init acpi_bus_init(void) { int result; acpi_status status; acpi_os_initialize1(); status = acpi_load_tables(); if (ACPI_FAILURE(status)) { pr_err("Unable to load the System Description Tables\n"); goto error1; } /* * ACPI 2.0 requires the EC driver to be loaded and work before the EC * device is found in the namespace. * * This is accomplished by looking for the ECDT table and getting the EC * parameters out of that. * * Do that before calling acpi_initialize_objects() which may trigger EC * address space accesses. */ acpi_ec_ecdt_probe(); status = acpi_enable_subsystem(ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { pr_err("Unable to start the ACPI Interpreter\n"); goto error1; } status = acpi_initialize_objects(ACPI_FULL_INITIALIZATION); if (ACPI_FAILURE(status)) { pr_err("Unable to initialize ACPI objects\n"); goto error1; } /* * _OSC method may exist in module level code, * so it must be run after ACPI_FULL_INITIALIZATION */ acpi_bus_osc_negotiate_platform_control(); acpi_bus_osc_negotiate_usb_control(); /* * _PDC control method may load dynamic SSDT tables, * and we need to install the table handler before that. */ status = acpi_install_table_handler(acpi_bus_table_handler, NULL); acpi_sysfs_init(); acpi_early_processor_control_setup(); /* * Maybe EC region is required at bus_scan/acpi_get_devices. So it * is necessary to enable it as early as possible. */ acpi_ec_dsdt_probe(); pr_info("Interpreter enabled\n"); /* Initialize sleep structures */ acpi_sleep_init(); /* * Get the system interrupt model and evaluate \_PIC. */ result = acpi_bus_init_irq(); if (result) goto error1; /* * Register the for all standard device notifications. */ status = acpi_install_notify_handler(ACPI_ROOT_OBJECT, ACPI_SYSTEM_NOTIFY, &acpi_bus_notify, NULL); if (ACPI_FAILURE(status)) { pr_err("Unable to register for system notifications\n"); goto error1; } /* * Create the top ACPI proc directory */ acpi_root_dir = proc_mkdir(ACPI_BUS_FILE_ROOT, NULL); result = bus_register(&acpi_bus_type); if (!result) return 0; /* Mimic structured exception handling */ error1: acpi_terminate(); return -ENODEV; } struct kobject *acpi_kobj; EXPORT_SYMBOL_GPL(acpi_kobj); void __weak __init acpi_arch_init(void) { } static int __init acpi_init(void) { int result; if (acpi_disabled) { pr_info("Interpreter disabled.\n"); return -ENODEV; } acpi_kobj = kobject_create_and_add("acpi", firmware_kobj); if (!acpi_kobj) pr_debug("%s: kset create error\n", __func__); init_prmt(); acpi_init_pcc(); result = acpi_bus_init(); if (result) { kobject_put(acpi_kobj); disable_acpi(); return result; } acpi_init_ffh(); pci_mmcfg_late_init(); acpi_viot_early_init(); acpi_hest_init(); acpi_ghes_init(); acpi_arch_init(); acpi_scan_init(); acpi_ec_init(); acpi_debugfs_init(); acpi_sleep_proc_init(); acpi_wakeup_device_init(); acpi_debugger_init(); acpi_setup_sb_notify_handler(); acpi_viot_init(); return 0; } subsys_initcall(acpi_init);
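The driver-management and probe paths above (acpi_bus_match(), acpi_device_probe(), acpi_device_remove()) are what a struct acpi_driver exercises from the other side. Below is a minimal sketch of such a driver, not taken from this file: the "TEST0001" HID and every demo_* name are hypothetical, and the callback signatures assume a current mainline tree where ops.remove returns void.

/* Hypothetical ACPI driver sketch: ops.add/ops.remove/ops.notify are invoked
 * by acpi_device_probe()/acpi_device_remove() shown above; the notify handler
 * is installed because ops.notify is non-NULL. */
#include <linux/module.h>
#include <linux/acpi.h>

static const struct acpi_device_id demo_ids[] = {
	{ "TEST0001", 0 },	/* hypothetical HID, matched by acpi_bus_match() */
	{ }
};
MODULE_DEVICE_TABLE(acpi, demo_ids);

static int demo_add(struct acpi_device *adev)
{
	dev_info(&adev->dev, "bound to %s\n", acpi_device_hid(adev));
	return 0;
}

static void demo_remove(struct acpi_device *adev)
{
	dev_info(&adev->dev, "unbound\n");
}

static void demo_notify(struct acpi_device *adev, u32 event)
{
	dev_info(&adev->dev, "notify event 0x%x\n", event);
}

static struct acpi_driver demo_driver = {
	.name = "demo",
	.ids = demo_ids,
	.ops = {
		.add = demo_add,
		.remove = demo_remove,
		.notify = demo_notify,
	},
};
module_acpi_driver(demo_driver);

MODULE_LICENSE("GPL");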
// SPDX-License-Identifier: GPL-2.0
/* dvb-usb-dvb.c is part of the DVB USB library.
 *
 * Copyright (C) 2004-6 Patrick Boettcher (patrick.boettcher@posteo.de)
 * see dvb-usb-init.c for copyright information.
 *
 * This file contains functions for initializing and handling the
 * linux-dvb API.
 */
#include "dvb-usb-common.h"

#include <media/media-device.h>

/* does the complete input transfer handling */
static int dvb_usb_ctrl_feed(struct dvb_demux_feed *dvbdmxfeed, int onoff)
{
	struct dvb_usb_adapter *adap = dvbdmxfeed->demux->priv;
	int newfeedcount, ret;

	if (adap == NULL)
		return -ENODEV;

	if ((adap->active_fe < 0) ||
	    (adap->active_fe >= adap->num_frontends_initialized)) {
		return -EINVAL;
	}

	newfeedcount = adap->feedcount + (onoff ? 1 : -1);

	/* stop feed before setting a new pid if there will be no pid anymore */
	if (newfeedcount == 0) {
		deb_ts("stop feeding\n");
		usb_urb_kill(&adap->fe_adap[adap->active_fe].stream);

		if (adap->props.fe[adap->active_fe].streaming_ctrl != NULL) {
			ret = adap->props.fe[adap->active_fe].streaming_ctrl(adap, 0);
			if (ret < 0) {
				err("error while stopping stream.");
				return ret;
			}
		}
	}

	adap->feedcount = newfeedcount;

	/* activate the pid on the device specific pid_filter */
	deb_ts("setting pid (%s): %5d %04x at index %d '%s'\n",
	       adap->fe_adap[adap->active_fe].pid_filtering ? "yes" : "no",
	       dvbdmxfeed->pid, dvbdmxfeed->pid, dvbdmxfeed->index,
	       onoff ? "on" : "off");
	if (adap->props.fe[adap->active_fe].caps & DVB_USB_ADAP_HAS_PID_FILTER &&
	    adap->fe_adap[adap->active_fe].pid_filtering &&
	    adap->props.fe[adap->active_fe].pid_filter != NULL)
		adap->props.fe[adap->active_fe].pid_filter(adap,
			dvbdmxfeed->index, dvbdmxfeed->pid, onoff);

	/* start the feed if this was the first feed and there is still a feed
	 * for reception.
*/ if (adap->feedcount == onoff && adap->feedcount > 0) { deb_ts("controlling pid parser\n"); if (adap->props.fe[adap->active_fe].caps & DVB_USB_ADAP_HAS_PID_FILTER && adap->props.fe[adap->active_fe].caps & DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF && adap->props.fe[adap->active_fe].pid_filter_ctrl != NULL) { ret = adap->props.fe[adap->active_fe].pid_filter_ctrl(adap, adap->fe_adap[adap->active_fe].pid_filtering); if (ret < 0) { err("could not handle pid_parser"); return ret; } } deb_ts("start feeding\n"); if (adap->props.fe[adap->active_fe].streaming_ctrl != NULL) { ret = adap->props.fe[adap->active_fe].streaming_ctrl(adap, 1); if (ret < 0) { err("error while enabling fifo."); return ret; } } deb_ts("submitting all URBs\n"); usb_urb_submit(&adap->fe_adap[adap->active_fe].stream); } return 0; } static int dvb_usb_start_feed(struct dvb_demux_feed *dvbdmxfeed) { deb_ts("start pid: 0x%04x, feedtype: %d\n", dvbdmxfeed->pid, dvbdmxfeed->type); return dvb_usb_ctrl_feed(dvbdmxfeed, 1); } static int dvb_usb_stop_feed(struct dvb_demux_feed *dvbdmxfeed) { deb_ts("stop pid: 0x%04x, feedtype: %d\n", dvbdmxfeed->pid, dvbdmxfeed->type); return dvb_usb_ctrl_feed(dvbdmxfeed, 0); } static int dvb_usb_media_device_init(struct dvb_usb_adapter *adap) { #ifdef CONFIG_MEDIA_CONTROLLER_DVB struct media_device *mdev; struct dvb_usb_device *d = adap->dev; struct usb_device *udev = d->udev; mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return -ENOMEM; media_device_usb_init(mdev, udev, d->desc->name); dvb_register_media_controller(&adap->dvb_adap, mdev); dev_info(&d->udev->dev, "media controller created\n"); #endif return 0; } static int dvb_usb_media_device_register(struct dvb_usb_adapter *adap) { #ifdef CONFIG_MEDIA_CONTROLLER_DVB return media_device_register(adap->dvb_adap.mdev); #else return 0; #endif } static void dvb_usb_media_device_unregister(struct dvb_usb_adapter *adap) { #ifdef CONFIG_MEDIA_CONTROLLER_DVB if (!adap->dvb_adap.mdev) return; mutex_lock(&adap->dvb_adap.mdev_lock); media_device_unregister(adap->dvb_adap.mdev); media_device_cleanup(adap->dvb_adap.mdev); kfree(adap->dvb_adap.mdev); adap->dvb_adap.mdev = NULL; mutex_unlock(&adap->dvb_adap.mdev_lock); #endif } int dvb_usb_adapter_dvb_init(struct dvb_usb_adapter *adap, short *adapter_nums) { int i; int ret = dvb_register_adapter(&adap->dvb_adap, adap->dev->desc->name, adap->dev->owner, &adap->dev->udev->dev, adapter_nums); if (ret < 0) { deb_info("dvb_register_adapter failed: error %d", ret); goto err; } adap->dvb_adap.priv = adap; ret = dvb_usb_media_device_init(adap); if (ret < 0) { deb_info("dvb_usb_media_device_init failed: error %d", ret); goto err_mc; } if (adap->dev->props.read_mac_address) { if (adap->dev->props.read_mac_address(adap->dev, adap->dvb_adap.proposed_mac) == 0) info("MAC address: %pM", adap->dvb_adap.proposed_mac); else err("MAC address reading failed."); } adap->demux.dmx.capabilities = DMX_TS_FILTERING | DMX_SECTION_FILTERING; adap->demux.priv = adap; adap->demux.filternum = 0; for (i = 0; i < adap->props.num_frontends; i++) { if (adap->demux.filternum < adap->fe_adap[i].max_feed_count) adap->demux.filternum = adap->fe_adap[i].max_feed_count; } adap->demux.feednum = adap->demux.filternum; adap->demux.start_feed = dvb_usb_start_feed; adap->demux.stop_feed = dvb_usb_stop_feed; adap->demux.write_to_decoder = NULL; if ((ret = dvb_dmx_init(&adap->demux)) < 0) { err("dvb_dmx_init failed: error %d", ret); goto err_dmx; } adap->dmxdev.filternum = adap->demux.filternum; adap->dmxdev.demux = &adap->demux.dmx; 
adap->dmxdev.capabilities = 0; if ((ret = dvb_dmxdev_init(&adap->dmxdev, &adap->dvb_adap)) < 0) { err("dvb_dmxdev_init failed: error %d", ret); goto err_dmx_dev; } if ((ret = dvb_net_init(&adap->dvb_adap, &adap->dvb_net, &adap->demux.dmx)) < 0) { err("dvb_net_init failed: error %d", ret); goto err_net_init; } adap->state |= DVB_USB_ADAP_STATE_DVB; return 0; err_net_init: dvb_dmxdev_release(&adap->dmxdev); err_dmx_dev: dvb_dmx_release(&adap->demux); err_dmx: dvb_usb_media_device_unregister(adap); err_mc: dvb_unregister_adapter(&adap->dvb_adap); err: return ret; } int dvb_usb_adapter_dvb_exit(struct dvb_usb_adapter *adap) { if (adap->state & DVB_USB_ADAP_STATE_DVB) { deb_info("unregistering DVB part\n"); dvb_net_release(&adap->dvb_net); adap->demux.dmx.close(&adap->demux.dmx); dvb_dmxdev_release(&adap->dmxdev); dvb_dmx_release(&adap->demux); dvb_usb_media_device_unregister(adap); dvb_unregister_adapter(&adap->dvb_adap); adap->state &= ~DVB_USB_ADAP_STATE_DVB; } return 0; } static int dvb_usb_set_active_fe(struct dvb_frontend *fe, int onoff) { struct dvb_usb_adapter *adap = fe->dvb->priv; int ret = (adap->props.frontend_ctrl) ? adap->props.frontend_ctrl(fe, onoff) : 0; if (ret < 0) { err("frontend_ctrl request failed"); return ret; } if (onoff) adap->active_fe = fe->id; return 0; } static int dvb_usb_fe_wakeup(struct dvb_frontend *fe) { struct dvb_usb_adapter *adap = fe->dvb->priv; dvb_usb_device_power_ctrl(adap->dev, 1); dvb_usb_set_active_fe(fe, 1); if (adap->fe_adap[fe->id].fe_init) adap->fe_adap[fe->id].fe_init(fe); return 0; } static int dvb_usb_fe_sleep(struct dvb_frontend *fe) { struct dvb_usb_adapter *adap = fe->dvb->priv; if (adap->fe_adap[fe->id].fe_sleep) adap->fe_adap[fe->id].fe_sleep(fe); dvb_usb_set_active_fe(fe, 0); return dvb_usb_device_power_ctrl(adap->dev, 0); } int dvb_usb_adapter_frontend_init(struct dvb_usb_adapter *adap) { int ret, i; /* register all given adapter frontends */ for (i = 0; i < adap->props.num_frontends; i++) { if (adap->props.fe[i].frontend_attach == NULL) { err("strange: '%s' #%d,%d doesn't want to attach a frontend.", adap->dev->desc->name, adap->id, i); return 0; } ret = adap->props.fe[i].frontend_attach(adap); if (ret || adap->fe_adap[i].fe == NULL) { /* only print error when there is no FE at all */ if (i == 0) err("no frontend was attached by '%s'", adap->dev->desc->name); return 0; } adap->fe_adap[i].fe->id = i; /* re-assign sleep and wakeup functions */ adap->fe_adap[i].fe_init = adap->fe_adap[i].fe->ops.init; adap->fe_adap[i].fe->ops.init = dvb_usb_fe_wakeup; adap->fe_adap[i].fe_sleep = adap->fe_adap[i].fe->ops.sleep; adap->fe_adap[i].fe->ops.sleep = dvb_usb_fe_sleep; if (dvb_register_frontend(&adap->dvb_adap, adap->fe_adap[i].fe)) { err("Frontend %d registration failed.", i); dvb_frontend_detach(adap->fe_adap[i].fe); adap->fe_adap[i].fe = NULL; /* In error case, do not try register more FEs, * still leaving already registered FEs alive. 
*/ if (i == 0) return -ENODEV; else return 0; } /* only attach the tuner if the demod is there */ if (adap->props.fe[i].tuner_attach != NULL) adap->props.fe[i].tuner_attach(adap); adap->num_frontends_initialized++; } ret = dvb_create_media_graph(&adap->dvb_adap, true); if (ret) return ret; ret = dvb_usb_media_device_register(adap); return ret; } int dvb_usb_adapter_frontend_exit(struct dvb_usb_adapter *adap) { int i = adap->num_frontends_initialized - 1; /* unregister all given adapter frontends */ for (; i >= 0; i--) { if (adap->fe_adap[i].fe != NULL) { dvb_unregister_frontend(adap->fe_adap[i].fe); dvb_frontend_detach(adap->fe_adap[i].fe); } } adap->num_frontends_initialized = 0; return 0; }
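dvb_usb_ctrl_feed() above is, at its core, a reference count around the USB streaming machinery: the stream is killed when the last feed goes away (newfeedcount == 0) and started again when the first feed arrives (feedcount == onoff && feedcount > 0). A stripped-down illustration of only that counting logic, with hypothetical helper names and no hardware access:

/* Illustration only: the start/stop decisions of dvb_usb_ctrl_feed(), reduced
 * to a plain counter.  stream_start()/stream_stop() stand in for
 * usb_urb_submit()/usb_urb_kill() plus the streaming_ctrl callback. */
static int feedcount;

static void stream_stop(void)  { /* kill URBs, disable device FIFO */ }
static void stream_start(void) { /* enable device FIFO, submit URBs */ }

static void ctrl_feed(int onoff)
{
	int newfeedcount = feedcount + (onoff ? 1 : -1);

	if (newfeedcount == 0)			/* last feed removed */
		stream_stop();

	feedcount = newfeedcount;

	if (feedcount == onoff && feedcount > 0)	/* first feed added */
		stream_start();
}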
/*
 * Copyright 2011 Red Hat, Inc.
 * Copyright © 2014 The Chromium OS Authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software")
 * to deal in the software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * them Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTIBILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *	Adam Jackson <ajax@redhat.com>
 *	Ben Widawsky <ben@bwidawsk.net>
 */

/*
 * This is vgem, a (non-hardware-backed) GEM service. This is used by Mesa's
 * software renderer and the X server for efficient buffer sharing.
*/ #include <linux/dma-buf.h> #include <linux/module.h> #include <linux/platform_device.h> #include <linux/shmem_fs.h> #include <linux/vmalloc.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_gem_shmem_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> #include <drm/drm_prime.h> #include "vgem_drv.h" #define DRIVER_NAME "vgem" #define DRIVER_DESC "Virtual GEM provider" #define DRIVER_DATE "20120112" #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0 static struct vgem_device { struct drm_device drm; struct platform_device *platform; } *vgem_device; static int vgem_open(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile; int ret; vfile = kzalloc(sizeof(*vfile), GFP_KERNEL); if (!vfile) return -ENOMEM; file->driver_priv = vfile; ret = vgem_fence_open(vfile); if (ret) { kfree(vfile); return ret; } return 0; } static void vgem_postclose(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile = file->driver_priv; vgem_fence_close(vfile); kfree(vfile); } static struct drm_ioctl_desc vgem_ioctls[] = { DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW), }; DEFINE_DRM_GEM_FOPS(vgem_driver_fops); static struct drm_gem_object *vgem_gem_create_object(struct drm_device *dev, size_t size) { struct drm_gem_shmem_object *obj; obj = kzalloc(sizeof(*obj), GFP_KERNEL); if (!obj) return ERR_PTR(-ENOMEM); /* * vgem doesn't have any begin/end cpu access ioctls, therefore must use * coherent memory or dma-buf sharing just wont work. */ obj->map_wc = true; return &obj->base; } static const struct drm_driver vgem_driver = { .driver_features = DRIVER_GEM | DRIVER_RENDER, .open = vgem_open, .postclose = vgem_postclose, .ioctls = vgem_ioctls, .num_ioctls = ARRAY_SIZE(vgem_ioctls), .fops = &vgem_driver_fops, DRM_GEM_SHMEM_DRIVER_OPS, .gem_create_object = vgem_gem_create_object, .name = DRIVER_NAME, .desc = DRIVER_DESC, .date = DRIVER_DATE, .major = DRIVER_MAJOR, .minor = DRIVER_MINOR, }; static int __init vgem_init(void) { int ret; struct platform_device *pdev; pdev = platform_device_register_simple("vgem", -1, NULL, 0); if (IS_ERR(pdev)) return PTR_ERR(pdev); if (!devres_open_group(&pdev->dev, NULL, GFP_KERNEL)) { ret = -ENOMEM; goto out_unregister; } dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); vgem_device = devm_drm_dev_alloc(&pdev->dev, &vgem_driver, struct vgem_device, drm); if (IS_ERR(vgem_device)) { ret = PTR_ERR(vgem_device); goto out_devres; } vgem_device->platform = pdev; /* Final step: expose the device/driver to userspace */ ret = drm_dev_register(&vgem_device->drm, 0); if (ret) goto out_devres; return 0; out_devres: devres_release_group(&pdev->dev, NULL); out_unregister: platform_device_unregister(pdev); return ret; } static void __exit vgem_exit(void) { struct platform_device *pdev = vgem_device->platform; drm_dev_unregister(&vgem_device->drm); devres_release_group(&pdev->dev, NULL); platform_device_unregister(pdev); } module_init(vgem_init); module_exit(vgem_exit); MODULE_AUTHOR("Red Hat, Inc."); MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL and additional rights");
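vgem_init() above is a compact instance of the usual skeleton for virtual DRM drivers: register a dummy platform device to act as the parent for devres and DMA, open a devres group so allocations can be rolled back, allocate the drm_device with devm_drm_dev_alloc(), and only then expose it with drm_dev_register(), unwinding in reverse order on failure and in vgem_exit(). The sketch below follows that same skeleton under hypothetical names ("vdemo", struct vdemo_device, vdemo_driver); it is not part of this file.

/* Hypothetical virtual DRM driver skeleton mirroring vgem's init/teardown
 * order: platform device -> devres group -> devm_drm_dev_alloc -> register. */
#include <linux/module.h>
#include <linux/platform_device.h>
#include <drm/drm_drv.h>
#include <drm/drm_file.h>
#include <drm/drm_gem.h>
#include <drm/drm_gem_shmem_helper.h>

DEFINE_DRM_GEM_FOPS(vdemo_fops);

static const struct drm_driver vdemo_driver = {
	.driver_features	= DRIVER_GEM | DRIVER_RENDER,
	.fops			= &vdemo_fops,
	DRM_GEM_SHMEM_DRIVER_OPS,
	.name			= "vdemo",
	.desc			= "virtual GEM sketch",
	.major			= 1,
	.minor			= 0,
};

static struct vdemo_device {
	struct drm_device drm;
	struct platform_device *platform;
} *vdemo_device;

static int __init vdemo_init(void)
{
	struct platform_device *pdev;
	int ret;

	pdev = platform_device_register_simple("vdemo", -1, NULL, 0);
	if (IS_ERR(pdev))
		return PTR_ERR(pdev);

	if (!devres_open_group(&pdev->dev, NULL, GFP_KERNEL)) {
		ret = -ENOMEM;
		goto out_unregister;
	}

	vdemo_device = devm_drm_dev_alloc(&pdev->dev, &vdemo_driver,
					  struct vdemo_device, drm);
	if (IS_ERR(vdemo_device)) {
		ret = PTR_ERR(vdemo_device);
		goto out_devres;
	}
	vdemo_device->platform = pdev;

	ret = drm_dev_register(&vdemo_device->drm, 0);
	if (ret)
		goto out_devres;
	return 0;

out_devres:
	devres_release_group(&pdev->dev, NULL);
out_unregister:
	platform_device_unregister(pdev);
	return ret;
}

static void __exit vdemo_exit(void)
{
	struct platform_device *pdev = vdemo_device->platform;

	drm_dev_unregister(&vdemo_device->drm);
	devres_release_group(&pdev->dev, NULL);
	platform_device_unregister(pdev);
}

module_init(vdemo_init);
module_exit(vdemo_exit);
MODULE_LICENSE("GPL");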
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H

#include <linux/sizes.h>
#include <linux/compiler_types.h>
#include "ctree.h"
#include "fs.h"

struct block_device;
struct super_block;
struct extent_buffer;
struct btrfs_device;
struct btrfs_fs_devices;
struct btrfs_fs_info;
struct btrfs_super_block;
struct btrfs_trans_handle;
struct btrfs_tree_parent_check;
struct btrfs_transaction;

#define BTRFS_SUPER_MIRROR_MAX	 3
#define BTRFS_SUPER_MIRROR_SHIFT 12

/*
 * Fixed blocksize for all devices, applies to specific ways of reading
 * metadata like superblock. Must meet the set_blocksize requirements.
 *
 * Do not change.
 */
#define BTRFS_BDEV_BLOCKSIZE	(4096)

static inline u64 btrfs_sb_offset(int mirror)
{
	u64 start = SZ_16K;

	if (mirror)
		return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
	return BTRFS_SUPER_INFO_OFFSET;
}

void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
				      struct btrfs_tree_parent_check *check);
struct extent_buffer *btrfs_find_create_tree_block(
						struct btrfs_fs_info *fs_info,
						u64 bytenr, u64 owner_root,
						int level);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
			   const struct btrfs_super_block *disk_sb);
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
			 const struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
						   int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
					const struct btrfs_key *key);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid,
				     bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
					 u64 objectid, dev_t *anon_dev);
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
						 struct btrfs_path *path,
						 u64 objectid);
int btrfs_global_root_insert(struct btrfs_root *root);
void btrfs_global_root_delete(struct btrfs_root *root);
struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
				     struct btrfs_key *key);
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct
btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); int btrfs_validate_extent_buffer(struct extent_buffer *eb, const struct btrfs_tree_parent_check *check); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif /* * Grab a reference on the root so that it cannot be freed while we * access it. Note that this does not prevent the tree itself from being * dropped. * * If you need the whole tree to stay safe, use fs_info->subvol_srcu. */ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { if (!root) return NULL; if (refcount_inc_not_zero(&root->refs)) return root; return NULL; } void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); #endif
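As a usage note for the refcount helper above: btrfs_grab_root() only pins the in-memory struct btrfs_root, and every successful grab must be paired with btrfs_put_root(). A minimal hedged sketch of that pairing follows; the helper name and its purpose are made up for illustration, only the grab/put pair and root_key come from btrfs itself.

/*
 * Hypothetical helper: read a root's objectid while holding a reference.
 * btrfs_grab_root() pins the structure via its refcount only; as noted in the
 * comment above, it does not keep the tree itself from being dropped.
 */
static u64 example_peek_root_objectid(struct btrfs_root *root)
{
	u64 objectid;

	root = btrfs_grab_root(root);	/* NULL if the refcount already hit zero */
	if (!root)
		return 0;

	objectid = root->root_key.objectid;
	btrfs_put_root(root);		/* drop the reference taken above */
	return objectid;
}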
/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2009 Oracle. All rights reserved. */ #ifndef BTRFS_FREE_SPACE_CACHE_H #define BTRFS_FREE_SPACE_CACHE_H #include <linux/rbtree.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/freezer.h> #include "fs.h" struct inode; struct page; struct btrfs_fs_info; struct btrfs_path; struct btrfs_trans_handle; struct btrfs_trim_block_group; /* * This is the trim state of an extent or bitmap. * * BTRFS_TRIM_STATE_TRIMMING is special and used to maintain the state of a * bitmap as we may need several trims to fully trim a single bitmap entry. * This is reset should any free space other than trimmed space be added to the * bitmap. */ enum btrfs_trim_state { BTRFS_TRIM_STATE_UNTRIMMED, BTRFS_TRIM_STATE_TRIMMED, BTRFS_TRIM_STATE_TRIMMING, }; struct btrfs_free_space { struct rb_node offset_index; struct rb_node bytes_index; u64 offset; u64 bytes; u64 max_extent_size; unsigned long *bitmap; struct list_head list; enum btrfs_trim_state trim_state; s32 bitmap_extents; }; static inline bool btrfs_free_space_trimmed(struct btrfs_free_space *info) { return (info->trim_state == BTRFS_TRIM_STATE_TRIMMED); } static inline bool btrfs_free_space_trimming_bitmap( struct btrfs_free_space *info) { return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); } static inline bool btrfs_trim_interrupted(void) { return fatal_signal_pending(current) || freezing(current); } /* * Deltas are an effective way to populate global statistics. Give macro names * to make it clear what we're doing. An example is discard_extents in * btrfs_free_space_ctl.
*/ enum { BTRFS_STAT_CURR, BTRFS_STAT_PREV, BTRFS_STAT_NR_ENTRIES, }; struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; struct rb_root_cached free_space_bytes; u64 free_space; int extents_thresh; int free_extents; int total_bitmaps; int unit; u64 start; s32 discardable_extents[BTRFS_STAT_NR_ENTRIES]; s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES]; const struct btrfs_free_space_op *op; struct btrfs_block_group *block_group; struct mutex cache_writeout_mutex; struct list_head trimming_ranges; }; struct btrfs_free_space_op { bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); }; struct btrfs_io_ctl { void *cur, *orig; struct page *page; struct page **pages; struct btrfs_fs_info *fs_info; struct inode *inode; unsigned long size; int index; int num_pages; int entries; int bitmaps; }; int __init btrfs_free_space_init(void); void __cold btrfs_free_space_exit(void); struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, struct btrfs_path *path); int create_free_space_inode(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group); int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct inode *inode); int load_free_space_cache(struct btrfs_block_group *block_group); int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); int btrfs_write_out_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group); bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size); void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes); int btrfs_find_space_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 bytes, u64 min_start, u64 *max_extent_size); void btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster); int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen, bool async); int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen, u64 maxlen, bool async); bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info); int 
btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active); /* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int test_add_free_space_entry(struct btrfs_block_group *cache, u64 offset, u64 bytes, bool bitmap); int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes); #endif #endif
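The BTRFS_STAT_CURR/BTRFS_STAT_PREV pair in btrfs_free_space_ctl supports the delta pattern described in the comment above: the per-block-group counter changes freely under tree_lock, and only the difference since the last roll-up is folded into a global statistic. A hedged sketch of that pattern follows; the function name and the atomic64_t destination are made up, only the ctl fields come from this header.

/*
 * Hypothetical roll-up of a CURR/PREV delta into a global counter, mirroring
 * how discardable_extents is described above.  Illustration only.
 */
static void example_rollup_discardable_extents(struct btrfs_free_space_ctl *ctl,
					       atomic64_t *global_stat)
{
	s32 delta;

	spin_lock(&ctl->tree_lock);
	delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
		ctl->discardable_extents[BTRFS_STAT_PREV];
	ctl->discardable_extents[BTRFS_STAT_PREV] =
		ctl->discardable_extents[BTRFS_STAT_CURR];
	spin_unlock(&ctl->tree_lock);

	if (delta)
		atomic64_add(delta, global_stat);
}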
// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables machines with Intel VT-x extensions to run virtual * machines without emulation or binary translation. * * MMU support * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates.
* * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "irq.h" #include "ioapic.h" #include "mmu.h" #include "mmu_internal.h" #include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" #include "smm.h" #include "kvm_emulate.h" #include "page_track.h" #include "cpuid.h" #include "spte.h" #include <linux/kvm_host.h> #include <linux/types.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <linux/swap.h> #include <linux/hugetlb.h> #include <linux/compiler.h> #include <linux/srcu.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> #include <linux/kstrtox.h> #include <linux/kthread.h> #include <linux/wordpart.h> #include <asm/page.h> #include <asm/memtype.h> #include <asm/cmpxchg.h> #include <asm/io.h> #include <asm/set_memory.h> #include <asm/spec-ctrl.h> #include <asm/vmx.h> #include "trace.h" static bool nx_hugepage_mitigation_hard_disabled; int __read_mostly nx_huge_pages = -1; static uint __read_mostly nx_huge_pages_recovery_period_ms; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; #else static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp); static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = get_nx_huge_pages, }; static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { .set = set_nx_huge_pages_recovery_param, .get = param_get_uint, }; module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); __MODULE_PARM_TYPE(nx_huge_pages, "bool"); module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops, &nx_huge_pages_recovery_ratio, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops, &nx_huge_pages_recovery_period_ms, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint"); static bool __read_mostly force_flush_and_sync_on_reuse; module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: * 1. the guest-virtual to guest-physical * 2. while doing 1. it walks guest-physical to host-physical * If the hardware supports that we don't need to do shadow paging. */ bool tdp_enabled = false; static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); #endif static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; #define PTE_PREFETCH_NUM 8 #include <trace/events/kvm.h> /* make pte_list_desc fit well in cache lines */ #define PTE_LIST_EXT 14 /* * struct pte_list_desc is the core data structure used to implement a custom * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a * given GFN when used in the context of rmaps. 
Using a custom list allows KVM * to optimize for the common case where many GFNs will have at most a handful * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small * memory footprint, which in turn improves runtime performance by exploiting * cache locality. * * A list is comprised of one or more pte_list_desc objects (descriptors). * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor * is full and a new SPTEs needs to be added, a new descriptor is allocated and * becomes the head of the list. This means that by definitions, all tail * descriptors are full. * * Note, the meta data fields are deliberately placed at the start of the * structure to optimize the cacheline layout; accessing the descriptor will * touch only a single cacheline so long as @spte_count<=6 (or if only the * descriptors metadata is accessed). */ struct pte_list_desc { struct pte_list_desc *more; /* The number of PTEs stored in _this_ descriptor. */ u32 spte_count; /* The number of PTEs stored in all tails of this descriptor. */ u32 tail_count; u64 *sptes[PTE_LIST_EXT]; }; struct kvm_shadow_walk_iterator { u64 addr; hpa_t shadow_addr; u64 *sptep; int level; unsigned index; }; #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \ (_root), (_addr)); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) #define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)) && \ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ __shadow_walk_next(&(_walker), spte)) static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; static void mmu_spte_set(u64 *sptep, u64 spte); struct kvm_mmu_role_regs { const unsigned long cr0; const unsigned long cr4; const u64 efer; }; #define CREATE_TRACE_POINTS #include "mmutrace.h" /* * Yes, lot's of underscores. They're a hint that you probably shouldn't be * reading from the role_regs. Once the root_role is constructed, it becomes * the single source of truth for the MMU's state. */ #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ static inline bool __maybe_unused \ ____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \ { \ return !!(regs->reg & flag); \ } BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG); BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57); BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX); BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); /* * The MMU itself (with a valid role) is the single source of truth for the * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1, * and the vCPU may be incorrect/irrelevant. */ #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ { \ return !!(mmu->cpu_role. base_or_ext . 
reg##_##name); \ } BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57); BUILD_MMU_ROLE_ACCESSOR(base, efer, nx); BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma); static inline bool is_cr0_pg(struct kvm_mmu *mmu) { return mmu->cpu_role.base.level > 0; } static inline bool is_cr4_pae(struct kvm_mmu *mmu) { return !mmu->cpu_role.base.has_4_byte_gpte; } static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) { struct kvm_mmu_role_regs regs = { .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS), .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS), .efer = vcpu->arch.efer, }; return regs; } static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu) { return kvm_read_cr3(vcpu); } static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3) return kvm_read_cr3(vcpu); return mmu->get_guest_pgd(vcpu); } static inline bool kvm_available_flush_remote_tlbs_range(void) { #if IS_ENABLED(CONFIG_HYPERV) return kvm_x86_ops.flush_remote_tlbs_range; #else return false; #endif } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); /* Flush the range of guest memory mapped by the given SPTE. */ static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep)); kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { u64 spte = make_mmio_spte(vcpu, gfn, access); trace_mark_mmio_spte(sptep, gfn, spte); mmu_spte_set(sptep, spte); } static gfn_t get_mmio_spte_gfn(u64 spte) { u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN) & shadow_nonpresent_or_rsvd_mask; return gpa >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) { return spte & shadow_mmio_access_mask; } static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { u64 kvm_gen, spte_gen, gen; gen = kvm_vcpu_memslots(vcpu)->generation; if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) return false; kvm_gen = gen & MMIO_SPTE_GEN_MASK; spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); return likely(kvm_gen == spte_gen); } static int is_cpuid_PSE36(void) { return 1; } #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); return xchg(sptep, spte); } static u64 __get_spte_lockless(u64 *sptep) { return READ_ONCE(*sptep); } #else union split_spte { struct { u32 spte_low; u32 spte_high; }; u64 spte; }; static void count_spte_clear(u64 *sptep, u64 spte) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (is_shadow_present_pte(spte)) return; /* Ensure the spte is completely set before we increase the count */ smp_wmb(); sp->clear_spte_count++; } static void __set_spte(u64 *sptep, u64 spte) { union split_spte *ssptep, sspte; ssptep = (union split_spte *)sptep; sspte = (union split_spte)spte; ssptep->spte_high 
= sspte.spte_high; /* * If we map the spte from nonpresent to present, We should store * the high bits firstly, then set present bit, so cpu can not * fetch this spte while we are setting the spte. */ smp_wmb(); WRITE_ONCE(ssptep->spte_low, sspte.spte_low); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { union split_spte *ssptep, sspte; ssptep = (union split_spte *)sptep; sspte = (union split_spte)spte; WRITE_ONCE(ssptep->spte_low, sspte.spte_low); /* * If we map the spte from present to nonpresent, we should clear * present bit firstly to avoid vcpu fetch the old high bits. */ smp_wmb(); ssptep->spte_high = sspte.spte_high; count_spte_clear(sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { union split_spte *ssptep, sspte, orig; ssptep = (union split_spte *)sptep; sspte = (union split_spte)spte; /* xchg acts as a barrier before the setting of the high bits */ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); orig.spte_high = ssptep->spte_high; ssptep->spte_high = sspte.spte_high; count_spte_clear(sptep, spte); return orig.spte; } /* * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * * An spte tlb flush may be pending, because they are coalesced and * we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * * Reading the spte while an update is in progress may get the old value * for the high part of the spte. The race is fine for a present->non-present * change (because the high part of the spte is ignored for non-present spte), * but for a present->present change we must reread the spte. * * All such changes are done in two steps (present->non-present and * non-present->present), hence it is enough to count the number of * present->non-present updates: if it changed while reading the spte, * we might have hit the race. This is done using clear_spte_count. */ static u64 __get_spte_lockless(u64 *sptep) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); union split_spte spte, *orig = (union split_spte *)sptep; int count; retry: count = sp->clear_spte_count; smp_rmb(); spte.spte_low = orig->spte_low; smp_rmb(); spte.spte_high = orig->spte_high; smp_rmb(); if (unlikely(spte.spte_low != orig->spte_low || count != sp->clear_spte_count)) goto retry; return spte.spte; } #endif /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present * or in a state where the hardware will not attempt to update * the spte. */ static void mmu_spte_set(u64 *sptep, u64 new_spte) { WARN_ON_ONCE(is_shadow_present_pte(*sptep)); __set_spte(sptep, new_spte); } /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changed. 
* * Returns true if the TLB needs to be flushed */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; WARN_ON_ONCE(!is_shadow_present_pte(new_spte)); check_spte_writable_invariants(new_spte); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); return false; } if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); WARN_ON_ONCE(!is_shadow_present_pte(old_spte) || spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); return leaf_spte_change_needs_tlb_flush(old_spte, new_spte); } /* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. * Returns the old PTE. */ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); if (!is_shadow_present_pte(old_spte)) return old_spte; kvm_update_page_stats(kvm, level, -1); return old_spte; } /* * Rules for using mmu_spte_clear_no_track: * Directly clear spte without caring the state bits of sptep, * it is used to set the upper level spte. */ static void mmu_spte_clear_no_track(u64 *sptep) { __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); } static u64 mmu_spte_get_lockless(u64 *sptep) { return __get_spte_lockless(sptep); } static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) { return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; } static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_begin(); } else { /* * Prevent page table teardown by making any free-er wait during * kvm_flush_remote_tlbs() IPI to all active vcpus. */ local_irq_disable(); /* * Make sure a following spte read is not reordered ahead of the write * to vcpu->mode. */ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_end(); } else { /* * Make sure the write to vcpu->mode is not reordered in front of * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) { int r; /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. 
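/*
 * Editor's note: illustrative sketch, not kernel code, of the fast/slow
 * update split used by mmu_spte_update() and mmu_spte_clear_track_bits()
 * above. When no volatile bits (e.g. hardware-set Accessed/Dirty) can
 * change underneath KVM, a plain store suffices; otherwise an atomic
 * exchange is used so the returned old value includes any bits the CPU
 * set concurrently. The helper name and parameter are hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

uint64_t model_update_spte(_Atomic uint64_t *sptep, uint64_t new_spte,
                           bool has_volatile_bits)
{
        uint64_t old;

        if (!has_volatile_bits) {
                /* Fast path: the old value cannot change, a snapshot is exact. */
                old = atomic_load(sptep);
                atomic_store(sptep, new_spte);
        } else {
                /* Slow path: capture racing A/D updates atomically. */
                old = atomic_exchange(sptep, new_spte);
        }
        return old;
}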
*/ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) return r; r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; if (maybe_indirect) { r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; } return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, PT64_ROOT_MAX_LEVEL); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { kmem_cache_free(pte_list_desc_cache, pte_list_desc); } static bool sp_has_gptes(struct kvm_mmu_page *sp); static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) { if (sp->role.passthrough) return sp->gfn; if (sp->shadowed_translation) return sp->shadowed_translation[index] >> PAGE_SHIFT; return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } /* * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note * that the SPTE itself may have a more constrained access permissions that * what the guest enforces. For example, a guest may create an executable * huge PTE but KVM may disallow execution to mitigate iTLB multihit. */ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { if (sp->shadowed_translation) return sp->shadowed_translation[index] & ACC_ALL; /* * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, * KVM is not shadowing any guest page tables, so the "guest access * permissions" are just ACC_ALL. * * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM * is shadowing a guest huge page with small pages, the guest access * permissions being shadowed are the access permissions of the huge * page. * * In both cases, sp->role.access contains the correct access bits. */ return sp->role.access; } static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, gfn_t gfn, unsigned int access) { if (sp->shadowed_translation) { sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } WARN_ONCE(access != kvm_mmu_page_get_access(sp, index), "access mismatch under %s page %llx (expected %u, got %u)\n", sp->role.passthrough ? "passthrough" : "direct", sp->gfn, kvm_mmu_page_get_access(sp, index), access); WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index), "gfn mismatch under %s page %llx (expected %llx, got %llx)\n", sp->role.passthrough ? "passthrough" : "direct", sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn); } static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, unsigned int access) { gfn_t gfn = kvm_mmu_page_get_gfn(sp, index); kvm_mmu_page_set_translation(sp, index, gfn, access); } /* * Return the pointer to the large page information for a given gfn, * handling slots that are not large page aligned. */ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, const struct kvm_memory_slot *slot, int level) { unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); return &slot->arch.lpage_info[level - 2][idx]; } /* * The most significant bit in disallow_lpage tracks whether or not memory * attributes are mixed, i.e. not identical for all gfns at the current level. 
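/*
 * Editor's note: small standalone example, not kernel code, of the
 * direct-SP gfn calculation in kvm_mmu_page_get_gfn() above: each SPTE
 * of a level-N shadow page covers 512^(N-1) pages, so entry @index maps
 * gfn = sp->gfn + (index << ((level - 1) * 9)). The constant 9 stands in
 * for SPTE_LEVEL_BITS; the numbers below are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t base_gfn = 0x100000;   /* hypothetical sp->gfn */
        int level = 2;                  /* a 2MiB-level shadow page */
        int index = 3;

        uint64_t gfn = base_gfn + ((uint64_t)index << ((level - 1) * 9));

        /* Entry 3 of a level-2 page covers gfn 0x100600 (0x100000 + 3 * 512). */
        printf("entry %d maps gfn 0x%llx\n", index, (unsigned long long)gfn);
        return 0;
}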
* The lower order bits are used to refcount other cases where a hugepage is * disallowed, e.g. if KVM has shadow a page table at the gfn. */ #define KVM_LPAGE_MIXED_FLAG BIT(31) static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; int old, i; for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); old = linfo->disallow_lpage; linfo->disallow_lpage += count; WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG); } } void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, 1); } void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, -1); } static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; gfn_t gfn; kvm->arch.indirect_shadow_pages++; /* * Ensure indirect_shadow_pages is elevated prior to re-reading guest * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight * emulated writes are visible before re-reading guest PTEs, or that * an emulated write will see the elevated count and acquire mmu_lock * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write(). */ smp_mb(); gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); /* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PG_LEVEL_4K) return __kvm_write_track_add_gfn(kvm, slot, gfn); kvm_mmu_gfn_disallow_lpage(slot, gfn); if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K); } void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { /* * If it's possible to replace the shadow page with an NX huge page, * i.e. if the shadow page is the only thing currently preventing KVM * from using a huge page, add the shadow page to the list of "to be * zapped for NX recovery" pages. Note, the shadow page can already be * on the list if KVM is reusing an existing shadow page, i.e. if KVM * links a shadow page at multiple points. 
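/*
 * Editor's note: illustrative sketch, not kernel code, of the
 * disallow_lpage encoding used above: bit 31 (KVM_LPAGE_MIXED_FLAG) is a
 * "mixed attributes" flag and the low 31 bits refcount the reasons a
 * hugepage is disallowed. The WARN in update_gfn_disallow_lpage_count()
 * fires if a count update ever spills into the flag bit; the assert
 * below models the same invariant.
 */
#include <assert.h>
#include <stdint.h>

#define MODEL_LPAGE_MIXED_FLAG  (1u << 31)

void model_adjust_disallow_lpage(uint32_t *disallow_lpage, int count)
{
        uint32_t old = *disallow_lpage;

        *disallow_lpage += count;
        /* The refcount must never overflow into (or borrow from) the flag. */
        assert(!((old ^ *disallow_lpage) & MODEL_LPAGE_MIXED_FLAG));
}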
*/ if (!list_empty(&sp->possible_nx_huge_page_link)) return; ++kvm->stat.nx_lpage_splits; list_add_tail(&sp->possible_nx_huge_page_link, &kvm->arch.possible_nx_huge_pages); } static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, bool nx_huge_page_possible) { sp->nx_huge_page_disallowed = true; if (nx_huge_page_possible) track_possible_nx_huge_page(kvm, sp); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; gfn_t gfn; kvm->arch.indirect_shadow_pages--; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); if (sp->role.level > PG_LEVEL_4K) return __kvm_write_track_remove_gfn(kvm, slot, gfn); kvm_mmu_gfn_allow_lpage(slot, gfn); } void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { if (list_empty(&sp->possible_nx_huge_page_link)) return; --kvm->stat.nx_lpage_splits; list_del_init(&sp->possible_nx_huge_page_link); } static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { sp->nx_huge_page_disallowed = false; untrack_possible_nx_huge_page(kvm, sp); } static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return NULL; if (no_dirty_log && kvm_slot_dirty_track_enabled(slot)) return NULL; return slot; } /* * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct * pte_list_desc containing more mappings. */ #define KVM_RMAP_MANY BIT(0) /* * Returns the number of pointers in the rmap chain, not counting the new one. */ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int count = 0; if (!rmap_head->val) { rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & KVM_RMAP_MANY)) { desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); count = desc->tail_count + desc->spte_count; /* * If the previous head is full, allocate a new head descriptor * as tail descriptors are always kept full. */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); desc->spte_count = 0; desc->tail_count = count; rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; } desc->sptes[desc->spte_count++] = spte; } return count; } static void pte_list_desc_remove_entry(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i) { struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); int j = head_desc->spte_count - 1; /* * The head descriptor should never be empty. A new head is added only * when adding an entry and the previous head is full, and heads are * removed (this flow) when they become empty. */ KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm); /* * Replace the to-be-freed SPTE with the last valid entry from the head * descriptor to ensure that tail descriptors are full at all times. * Note, this also means that tail_count is stable for each descriptor. 
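/*
 * Editor's note: a minimal standalone model, not kernel code, of the
 * rmap_head encoding described above: if bit 0 of ->val is clear, the
 * value is a single SPTE pointer; otherwise (val & ~1) points to a
 * descriptor chain holding several SPTE pointers. Structure layout and
 * names are hypothetical; tail descriptors are assumed full, as in the
 * real pte_list_add()/pte_list_count().
 */
#include <stdint.h>

#define MODEL_RMAP_MANY         1ul

struct model_desc {
        uint64_t *sptes[14];
        struct model_desc *more;
        int spte_count;
};

struct model_rmap_head {
        unsigned long val;
};

/* Count mapped SPTEs, mirroring what pte_list_count() computes. */
int model_rmap_count(struct model_rmap_head *head)
{
        struct model_desc *desc;
        int count = 0;

        if (!head->val)
                return 0;
        if (!(head->val & MODEL_RMAP_MANY))
                return 1;       /* a lone SPTE pointer, no descriptor */

        desc = (struct model_desc *)(head->val & ~MODEL_RMAP_MANY);
        for (; desc; desc = desc->more)
                count += desc->spte_count;
        return count;
}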
*/ desc->sptes[i] = head_desc->sptes[j]; head_desc->sptes[j] = NULL; head_desc->spte_count--; if (head_desc->spte_count) return; /* * The head descriptor is empty. If there are no tail descriptors, * nullify the rmap head to mark the list as empty, else point the rmap * head at the next descriptor, i.e. the new head. */ if (!head_desc->more) rmap_head->val = 0; else rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY; mmu_free_pte_list_desc(head_desc); } static void pte_list_remove(struct kvm *kvm, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int i; if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) return; if (!(rmap_head->val & KVM_RMAP_MANY)) { if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) return; rmap_head->val = 0; } else { desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { pte_list_desc_remove_entry(kvm, rmap_head, desc, i); return; } } desc = desc->more; } KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } } static void kvm_zap_one_rmap_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); pte_list_remove(kvm, sptep, rmap_head); } /* Return true if at least one SPTE was zapped, false otherwise */ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; if (!rmap_head->val) return false; if (!(rmap_head->val & KVM_RMAP_MANY)) { mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); goto out; } desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) mmu_spte_clear_track_bits(kvm, desc->sptes[i]); next = desc->more; mmu_free_pte_list_desc(desc); } out: /* rmap_head is meaningless now, remember to reset it */ rmap_head->val = 0; return true; } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; if (!rmap_head->val) return 0; else if (!(rmap_head->val & KVM_RMAP_MANY)) return 1; desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); return desc->tail_count + desc->spte_count; } static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, const struct kvm_memory_slot *slot) { unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; gfn_t gfn; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte)); /* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU * so we have to determine which memslots to use based on context * information in sp->role. */ slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); pte_list_remove(kvm, spte, rmap_head); } /* * Used by the following functions to iterate through the sptes linked by a * rmap. All fields are private and not assumed to be used outside. */ struct rmap_iterator { /* private fields */ struct pte_list_desc *desc; /* holds the sptep if not NULL */ int pos; /* index of the sptep */ }; /* * Iteration must be started by this function. This should also be used after * removing/dropping sptes from the rmap link because in such cases the * information in the iterator may not be valid. 
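/*
 * Editor's note: standalone example, not kernel code, of the rmap
 * indexing used by gfn_to_rmap() above: at @level, one rmap entry covers
 * 512^(level - 1) gfns, so the index is (gfn >> shift) - (base_gfn >> shift)
 * with shift = 9 * (level - 1). PG_LEVEL_4K corresponds to level 1 in
 * this numbering; the gfns below are hypothetical.
 */
#include <assert.h>
#include <stdint.h>

uint64_t model_gfn_to_index(uint64_t gfn, uint64_t base_gfn, int level)
{
        int shift = 9 * (level - 1);

        return (gfn >> shift) - (base_gfn >> shift);
}

int main(void)
{
        /* A 2MiB-level (level 2) rmap entry covers 512 consecutive gfns. */
        assert(model_gfn_to_index(0x10200, 0x10000, 2) == 1);
        /* At 4KiB (level 1) the index is just the gfn offset into the slot. */
        assert(model_gfn_to_index(0x10200, 0x10000, 1) == 0x200);
        return 0;
}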
* * Returns sptep if found, NULL otherwise. */ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct rmap_iterator *iter) { u64 *sptep; if (!rmap_head->val) return NULL; if (!(rmap_head->val & KVM_RMAP_MANY)) { iter->desc = NULL; sptep = (u64 *)rmap_head->val; goto out; } iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); iter->pos = 0; sptep = iter->desc->sptes[iter->pos]; out: BUG_ON(!is_shadow_present_pte(*sptep)); return sptep; } /* * Must be used with a valid iterator: e.g. after rmap_get_first(). * * Returns sptep if found, NULL otherwise. */ static u64 *rmap_get_next(struct rmap_iterator *iter) { u64 *sptep; if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { ++iter->pos; sptep = iter->desc->sptes[iter->pos]; if (sptep) goto out; } iter->desc = iter->desc->more; if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ sptep = iter->desc->sptes[iter->pos]; goto out; } } return NULL; out: BUG_ON(!is_shadow_present_pte(*sptep)); return sptep; } #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ _spte_; _spte_ = rmap_get_next(_iter_)) static void drop_spte(struct kvm *kvm, u64 *sptep) { u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep); if (is_shadow_present_pte(old_spte)) rmap_remove(kvm, sptep); } static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) { struct kvm_mmu_page *sp; sp = sptep_to_sp(sptep); WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); if (flush) kvm_flush_remote_tlbs_sptep(kvm, sptep); } /* * Write-protect on the specified @sptep, @pt_protect indicates whether * spte write-protection is caused by protecting shadow page table. * * Note: write protection is difference between dirty logging and spte * protection: * - for dirty logging, the spte can be set to writable at anytime if * its dirty bitmap is properly set. * - for spte protection, the spte can be writable only after unsync-ing * shadow page. * * Return true if tlb need be flushed. */ static bool spte_write_protect(u64 *sptep, bool pt_protect) { u64 spte = *sptep; if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte))) return false; if (pt_protect) spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; return mmu_spte_update(sptep, spte); } static bool rmap_write_protect(struct kvm_rmap_head *rmap_head, bool pt_protect) { u64 *sptep; struct rmap_iterator iter; bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_write_protect(sptep, pt_protect); return flush; } static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; KVM_MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; return mmu_spte_update(sptep, spte); } /* * Gets the GFN ready for another round of dirty logging by clearing the * - D bit on ad-enabled SPTEs, and * - W bit on ad-disabled SPTEs. * Returns true iff any D or W bits were cleared. 
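/*
 * Editor's note: standalone sketch, not kernel code, of the
 * write-protect decision in spte_write_protect() above. Bit 1 is the
 * architectural writable bit of an x86 PTE; the "MMU-writable" software
 * bit position used here is hypothetical (the real mask is configured
 * elsewhere in KVM's SPTE code).
 */
#include <stdbool.h>
#include <stdint.h>

#define MODEL_PT_WRITABLE       (1ull << 1)
#define MODEL_MMU_WRITABLE      (1ull << 57)    /* hypothetical software bit */

/* Returns true if the SPTE changed, i.e. a TLB flush may be needed. */
bool model_write_protect(uint64_t *sptep, bool pt_protect)
{
        uint64_t spte = *sptep;

        if (!(spte & MODEL_PT_WRITABLE) &&
            !(pt_protect && (spte & MODEL_MMU_WRITABLE)))
                return false;

        if (pt_protect)
                spte &= ~MODEL_MMU_WRITABLE;
        spte &= ~MODEL_PT_WRITABLE;

        *sptep = spte;
        return true;
}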
*/ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) if (spte_ad_need_write_protect(*sptep)) flush |= test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); else flush |= spte_clear_dirty(sptep); return flush; } static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { struct kvm_rmap_head *rmap_head; if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); if (!kvm_memslots_have_rmaps(kvm)) return; while (mask) { rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); rmap_write_protect(rmap_head, false); /* clear the first set bit */ mask &= mask - 1; } } static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { struct kvm_rmap_head *rmap_head; if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); if (!kvm_memslots_have_rmaps(kvm)) return; while (mask) { rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ mask &= mask - 1; } } void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { /* * If the slot was assumed to be "initially all dirty", write-protect * huge pages to ensure they are split to 4KiB on the first write (KVM * dirty logs at 4KiB granularity). If eager page splitting is enabled, * immediately try to split huge pages, e.g. so that vCPUs don't get * saddled with the cost of splitting. * * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn * of memslot has no such restriction, so the range can cross two large * pages. */ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); if (READ_ONCE(eager_page_split)) kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K); kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); /* Cross two large pages? */ if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) != ALIGN(end << PAGE_SHIFT, PMD_SIZE)) kvm_mmu_slot_gfn_write_protect(kvm, slot, end, PG_LEVEL_2M); } /* * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in * mask. If PML is enabled and the GFN doesn't need to be write- * protected for other reasons, e.g. shadow paging, clear the Dirty bit. * Otherwise clear the Writable bit. * * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is * enabled but it chooses between clearing the Dirty bit and Writeable * bit based on the context. 
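/*
 * Editor's note: small standalone example, not kernel code, of the mask
 * walk used by kvm_mmu_write_protect_pt_masked() and
 * kvm_mmu_clear_dirty_pt_masked() above: each set bit in @mask selects
 * one 4KiB gfn relative to slot->base_gfn + gfn_offset, and
 * "mask &= mask - 1" clears the lowest set bit to advance the walk.
 * __builtin_ctzll() stands in for __ffs(); the values are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t base = 0x1000;                 /* base_gfn + gfn_offset */
        unsigned long long mask = 0x29;         /* dirty bits 0, 3 and 5 set */

        while (mask) {
                unsigned int bit = __builtin_ctzll(mask);       /* lowest set bit */

                printf("handle 4KiB rmap for gfn 0x%llx\n",
                       (unsigned long long)(base + bit));
                mask &= mask - 1;               /* clear the lowest set bit */
        }
        return 0;
}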
*/ if (kvm_x86_ops.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } int kvm_cpu_dirty_log_size(void) { return kvm_x86_ops.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level) { struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = gfn_to_rmap(gfn, i, slot); write_protected |= rmap_write_protect(rmap_head, true); } } if (tdp_mmu_enabled) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); return write_protected; } static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { return kvm_zap_all_rmap_sptes(kvm, rmap_head); } struct slot_rmap_walk_iterator { /* input fields. */ const struct kvm_memory_slot *slot; gfn_t start_gfn; gfn_t end_gfn; int start_level; int end_level; /* output fields. */ gfn_t gfn; struct kvm_rmap_head *rmap; int level; /* private field. */ struct kvm_rmap_head *end_rmap; }; static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, const struct kvm_memory_slot *slot, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; iterator->start_level = start_level; iterator->end_level = end_level; iterator->start_gfn = start_gfn; iterator->end_gfn = end_gfn; rmap_walk_init_level(iterator, iterator->start_level); } static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) { return !!iterator->rmap; } static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { while (++iterator->rmap <= iterator->end_rmap) { iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); if (iterator->rmap->val) return; } if (++iterator->level > iterator->end_level) { iterator->rmap = NULL; return; } rmap_walk_init_level(iterator, iterator->level); } #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ _start_gfn, _end_gfn, _iter_) \ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ _end_level_, _start_gfn, _end_gfn); \ slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) /* The return value indicates if tlb flush on all vcpus is needed. 
*/ typedef bool (*slot_rmaps_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot); static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, const struct kvm_memory_slot *slot, slot_rmaps_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool can_yield, bool flush_on_yield, bool flush) { struct slot_rmap_walk_iterator iterator; lockdep_assert_held_write(&kvm->mmu_lock); for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, end_gfn, &iterator) { if (iterator.rmap) flush |= fn(kvm, iterator.rmap, slot); if (!can_yield) continue; if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && flush_on_yield) { kvm_flush_remote_tlbs_range(kvm, start_gfn, iterator.gfn - start_gfn + 1); flush = false; } cond_resched_rwlock_write(&kvm->mmu_lock); } } return flush; } static __always_inline bool walk_slot_rmaps(struct kvm *kvm, const struct kvm_memory_slot *slot, slot_rmaps_handler fn, int start_level, int end_level, bool flush_on_yield) { return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, slot->base_gfn, slot->base_gfn + slot->npages - 1, true, flush_on_yield, false); } static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, const struct kvm_memory_slot *slot, slot_rmaps_handler fn, bool flush_on_yield) { return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); } static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t start, gfn_t end, bool can_yield, bool flush) { return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, can_yield, true, flush); } bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { bool flush = false; /* * To prevent races with vCPUs faulting in a gfn using stale data, * zapping a gfn range must be protected by mmu_invalidate_in_progress * (and mmu_invalidate_seq). The only exception is memslot deletion; * in that case, SRCU synchronization ensures that SPTEs are zapped * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the * invalid slot. 
*/ lockdep_assert_once(kvm->mmu_invalidate_in_progress || lockdep_is_held(&kvm->slots_lock)); if (kvm_memslots_have_rmaps(kvm)) flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, range->start, range->end, range->may_block, flush); if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); if (kvm_x86_ops.set_apic_access_page_addr && range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); return flush; } #define RMAP_RECYCLE_THRESHOLD 1000 static void __rmap_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, const struct kvm_memory_slot *slot, u64 *spte, gfn_t gfn, unsigned int access) { struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; int rmap_count; sp = sptep_to_sp(spte); kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access); kvm_update_page_stats(kvm, sp->role.level, 1); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); rmap_count = pte_list_add(cache, spte, rmap_head); if (rmap_count > kvm->stat.max_mmu_rmap_size) kvm->stat.max_mmu_rmap_size = rmap_count; if (rmap_count > RMAP_RECYCLE_THRESHOLD) { kvm_zap_all_rmap_sptes(kvm, rmap_head); kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); } } static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, u64 *spte, gfn_t gfn, unsigned int access) { struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); } static bool kvm_rmap_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool test_only) { struct slot_rmap_walk_iterator iterator; struct rmap_iterator iter; bool young = false; u64 *sptep; for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, range->start, range->end - 1, &iterator) { for_each_rmap_spte(iterator.rmap, &iter, sptep) { u64 spte = *sptep; if (!is_accessed_spte(spte)) continue; if (test_only) return true; if (spte_ad_enabled(spte)) { clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } else { /* * WARN if mmu_spte_update() signals the need * for a TLB flush, as Access tracking a SPTE * should never trigger an _immediate_ flush. 
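/*
 * Editor's note: standalone illustration, not kernel code, of the
 * "ffs(mask) - 1" idiom in kvm_rmap_age_gfn_range() above: it turns a
 * single-bit mask into a bit index suitable for clear_bit(). Bit 5 (the
 * legacy x86 PTE Accessed bit) is used for illustration only; EPT places
 * its Accessed bit elsewhere, which is why the mask is looked up at
 * runtime rather than hard-coded.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t accessed_mask = 1ull << 5;
        uint64_t spte = (1ull << 0) | accessed_mask;    /* present + accessed */

        int bit = __builtin_ffsll(accessed_mask) - 1;   /* ffs() counts from 1 */
        assert(bit == 5);

        spte &= ~(1ull << bit);         /* what clear_bit() does, sans atomicity */
        assert(!(spte & accessed_mask));
        return 0;
}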
*/ spte = mark_spte_for_access_track(spte); WARN_ON_ONCE(mmu_spte_update(sptep, spte)); } young = true; } } return young; } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) young = kvm_rmap_age_gfn_range(kvm, range, false); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); return young; } bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) young = kvm_rmap_age_gfn_range(kvm, range, true); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); return young; } static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) { #ifdef CONFIG_KVM_PROVE_MMU int i; for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i]))) pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free", sp->spt[i], &sp->spt[i], kvm_mmu_page_get_gfn(sp, i)); } #endif } static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm->arch.n_used_mmu_pages++; kvm_account_pgtable_pages((void *)sp->spt, +1); } static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm->arch.n_used_mmu_pages--; kvm_account_pgtable_pages((void *)sp->spt, -1); } static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { kvm_mmu_check_sptes_at_free(sp); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } static unsigned kvm_page_table_hashfn(gfn_t gfn) { return hash_64(gfn, KVM_MMU_HASH_SHIFT); } static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; pte_list_add(cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { pte_list_remove(kvm, parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { mmu_page_remove_parent_pte(kvm, sp, parent_pte); mmu_spte_clear_no_track(parent_pte); } static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { mark_unsync(sptep); } } static void mark_unsync(u64 *spte) { struct kvm_mmu_page *sp; sp = sptep_to_sp(spte); if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) return; if (sp->unsync_children++) return; kvm_mmu_mark_parents_unsync(sp); } #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { struct mmu_page_and_offset { struct kvm_mmu_page *sp; unsigned int idx; } page[KVM_PAGE_ARRAY_NR]; unsigned int nr; }; static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, int idx) { int i; if (sp->unsync) for (i=0; i < pvec->nr; i++) if (pvec->page[i].sp == sp) return 0; pvec->page[pvec->nr].sp = sp; pvec->page[pvec->nr].idx = idx; pvec->nr++; return (pvec->nr == KVM_PAGE_ARRAY_NR); } static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) { --sp->unsync_children; WARN_ON_ONCE((int)sp->unsync_children < 0); __clear_bit(idx, sp->unsync_child_bitmap); } static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { int i, ret, nr_unsync_leaf = 0; for_each_set_bit(i, sp->unsync_child_bitmap, 512) { struct kvm_mmu_page *child; u64 ent = sp->spt[i]; if (!is_shadow_present_pte(ent) || 
is_large_pte(ent)) { clear_unsync_child_bit(sp, i); continue; } child = spte_to_child_sp(ent); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) return -ENOSPC; ret = __mmu_unsync_walk(child, pvec); if (!ret) { clear_unsync_child_bit(sp, i); continue; } else if (ret > 0) { nr_unsync_leaf += ret; } else return ret; } else if (child->unsync) { nr_unsync_leaf++; if (mmu_pages_add(pvec, child, i)) return -ENOSPC; } else clear_unsync_child_bit(sp, i); } return nr_unsync_leaf; } #define INVALID_INDEX (-1) static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { pvec->nr = 0; if (!sp->unsync_children) return 0; mmu_pages_add(pvec, sp, INVALID_INDEX); return __mmu_unsync_walk(sp, pvec); } static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON_ONCE(!sp->unsync); trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; } static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list); static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); static bool sp_has_gptes(struct kvm_mmu_page *sp) { if (sp->role.direct) return false; if (sp->role.passthrough) return false; return true; } #define for_each_valid_sp(_kvm, _sp, _list) \ hlist_for_each_entry(_sp, _list, hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ for_each_valid_sp(_kvm, _sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; /* * Ignore various flags when verifying that it's safe to sync a shadow * page using the current MMU context. * * - level: not part of the overall MMU role and will never match as the MMU's * level tracks the root level * - access: updated based on the new guest PTE * - quadrant: not part of the overall MMU role (similar to level) */ const union kvm_mmu_page_role sync_role_ign = { .level = 0xf, .access = 0x7, .quadrant = 0x3, .passthrough = 0x1, }; /* * Direct pages can never be unsync, and KVM should never attempt to * sync a shadow page for a different MMU context, e.g. if the role * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the * reserved bits checks will be wrong, etc... */ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte || (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) return false; return true; } static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { /* sp->spt[i] has initial value of shadow page table allocation */ if (sp->spt[i] == SHADOW_NONPRESENT_VALUE) return 0; return vcpu->arch.mmu->sync_spte(vcpu, sp, i); } static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int flush = 0; int i; if (!kvm_sync_page_check(vcpu, sp)) return -1; for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { int ret = kvm_sync_spte(vcpu, sp, i); if (ret < -1) return -1; flush |= ret; } /* * Note, any flush is purely for KVM's correctness, e.g. when dropping * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier * unmap or dirty logging event doesn't fail to flush. The guest is * responsible for flushing the TLB to ensure any changes in protection * bits are recognized, i.e. 
until the guest flushes or page faults on * a relevant address, KVM is architecturally allowed to let vCPUs use * cached translations with the old protection bits. */ return flush; } static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int ret = __kvm_sync_page(vcpu, sp); if (ret < 0) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return ret; } static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, struct list_head *invalid_list, bool remote_flush) { if (!remote_flush && list_empty(invalid_list)) return false; if (!list_empty(invalid_list)) kvm_mmu_commit_zap_page(kvm, invalid_list); else kvm_flush_remote_tlbs(kvm); return true; } static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->role.invalid) return true; /* TDP MMU pages do not use the MMU generation. */ return !is_tdp_mmu_page(sp) && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL]; unsigned int idx[PT64_ROOT_MAX_LEVEL]; }; #define for_each_sp(pvec, sp, parents, i) \ for (i = mmu_pages_first(&pvec, &parents); \ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ i = mmu_pages_next(&pvec, &parents, i)) static int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, int i) { int n; for (n = i+1; n < pvec->nr; n++) { struct kvm_mmu_page *sp = pvec->page[n].sp; unsigned idx = pvec->page[n].idx; int level = sp->role.level; parents->idx[level-1] = idx; if (level == PG_LEVEL_4K) break; parents->parent[level-2] = sp; } return n; } static int mmu_pages_first(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents) { struct kvm_mmu_page *sp; int level; if (pvec->nr == 0) return 0; WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX); sp = pvec->page[0].sp; level = sp->role.level; WARN_ON_ONCE(level == PG_LEVEL_4K); parents->parent[level-2] = sp; /* Also set up a sentinel. Further entries in pvec are all * children of sp, so this element is never overwritten. 
*/ parents->parent[level-1] = NULL; return mmu_pages_next(pvec, parents, 0); } static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; unsigned int level = 0; do { unsigned int idx = parents->idx[level]; sp = parents->parent[level]; if (!sp) return; WARN_ON_ONCE(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; } while (!sp->unsync_children); } static int mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *parent, bool can_yield) { int i; struct kvm_mmu_page *sp; struct mmu_page_path parents; struct kvm_mmu_pages pages; LIST_HEAD(invalid_list); bool flush = false; while (mmu_unsync_walk(parent, &pages)) { bool protected = false; for_each_sp(pages, sp, parents, i) protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn); if (protected) { kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); flush = false; } for_each_sp(pages, sp, parents, i) { kvm_unlink_unsync_page(vcpu->kvm, sp); flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0; mmu_pages_clear_parents(&parents); } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); if (!can_yield) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); return -EINTR; } cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); flush = false; } } kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); return 0; } static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) { atomic_set(&sp->write_flooding_count, 0); } static void clear_sp_write_flooding_count(u64 *spte) { __clear_sp_write_flooding_count(sptep_to_sp(spte)); } /* * The vCPU is required when finding indirect shadow pages; the shadow * page may already exist and syncing it needs the vCPU pointer in * order to read guest page tables. Direct shadow pages are never * unsync, thus @vcpu can be NULL if @role.direct is true. */ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, struct hlist_head *sp_list, union kvm_mmu_page_role role) { struct kvm_mmu_page *sp; int ret; int collisions = 0; LIST_HEAD(invalid_list); for_each_valid_sp(kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; } if (sp->role.word != role.word) { /* * If the guest is creating an upper-level page, zap * unsync pages for the same gfn. While it's possible * the guest is using recursive page tables, in all * likelihood the guest has stopped using the unsync * page and is installing a completely unrelated page. * Unsync pages must not be left as is, because the new * upper-level page will be write-protected. */ if (role.level > PG_LEVEL_4K && sp->unsync) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); continue; } /* unsync and write-flooding only apply to indirect SPs. */ if (sp->role.direct) goto out; if (sp->unsync) { if (KVM_BUG_ON(!vcpu, kvm)) break; /* * The page is good, but is stale. kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) * it doesn't write-protect the page or mark it synchronized! * This way the validity of the mapping is ensured, but the * overhead of write protection is not incurred until the * guest invalidates the TLB mapping. This allows multiple * SPs for a single gfn to be unsync. * * If the sync fails, the page is zapped. If so, break * in order to rebuild it. 
*/ ret = kvm_sync_page(vcpu, sp, &invalid_list); if (ret < 0) break; WARN_ON_ONCE(!list_empty(&invalid_list)); if (ret > 0) kvm_flush_remote_tlbs(kvm); } __clear_sp_write_flooding_count(sp); goto out; } sp = NULL; ++kvm->stat.mmu_cache_miss; out: kvm_mmu_commit_zap_page(kvm, &invalid_list); if (collisions > kvm->stat.max_mmu_page_hash_collisions) kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } /* Caches used when allocating a new shadow page. */ struct shadow_page_caches { struct kvm_mmu_memory_cache *page_header_cache; struct kvm_mmu_memory_cache *shadow_page_cache; struct kvm_mmu_memory_cache *shadowed_info_cache; }; static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, struct shadow_page_caches *caches, gfn_t gfn, struct hlist_head *sp_list, union kvm_mmu_page_role role) { struct kvm_mmu_page *sp; sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL) sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); /* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() * depends on valid pages being added to the head of the list. See * comments in kvm_zap_obsolete_pages(). */ sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; list_add(&sp->link, &kvm->arch.active_mmu_pages); kvm_account_mmu_page(kvm, sp); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); if (sp_has_gptes(sp)) account_shadowed(kvm, sp); return sp; } /* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu, struct shadow_page_caches *caches, gfn_t gfn, union kvm_mmu_page_role role) { struct hlist_head *sp_list; struct kvm_mmu_page *sp; bool created = false; sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); if (!sp) { created = true; sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role); } trace_kvm_mmu_get_page(sp, created); return sp; } static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, gfn_t gfn, union kvm_mmu_page_role role) { struct shadow_page_caches caches = { .page_header_cache = &vcpu->arch.mmu_page_header_cache, .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, }; return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); } static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, unsigned int access) { struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); union kvm_mmu_page_role role; role = parent_sp->role; role.level--; role.access = access; role.direct = direct; role.passthrough = 0; /* * If the guest has 4-byte PTEs then that means it's using 32-bit, * 2-level, non-PAE paging. KVM shadows such guests with PAE paging * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must * shadow each guest page table with multiple shadow page tables, which * requires extra bookkeeping in the role. * * Specifically, to shadow the guest's page directory (which covers a * 4GiB address space), KVM uses 4 PAE page directories, each mapping * 1GiB of the address space. @role.quadrant encodes which quarter of * the address space each maps. 
* * To shadow the guest's page tables (which each map a 4MiB region), KVM * uses 2 PAE page tables, each mapping a 2MiB region. For these, * @role.quadrant encodes which half of the region they map. * * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE * consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow * PDPTEs; those 4 PAE page directories are pre-allocated and their * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume * bit 21 in the PTE (the child here), KVM propagates that bit to the * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE * covers bit 21 (see above), thus the quadrant is calculated from the * _least_ significant bit of the PDE index. */ if (role.has_4_byte_gpte) { WARN_ON_ONCE(role.level != PG_LEVEL_4K); role.quadrant = spte_index(sptep) & 1; } return role; } static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, bool direct, unsigned int access) { union kvm_mmu_page_role role; if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) return ERR_PTR(-EEXIST); role = kvm_mmu_child_role(sptep, direct, access); return kvm_mmu_get_shadow_page(vcpu, gfn, role); } static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, hpa_t root, u64 addr) { iterator->addr = addr; iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu->root_role.level; if (iterator->level >= PT64_ROOT_4LEVEL && vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL && !vcpu->arch.mmu->root_role.direct) iterator->level = PT32E_ROOT_LEVEL; if (iterator->level == PT32E_ROOT_LEVEL) { /* * prev_root is currently only used for 64-bit hosts. So only * the active root_hpa is valid here. */ BUG_ON(root != vcpu->arch.mmu->root.hpa); iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= SPTE_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) iterator->level = 0; } } static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, u64 addr) { shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa, addr); } static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PG_LEVEL_4K) return false; iterator->index = SPTE_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; } static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, u64 spte) { if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) { iterator->level = 0; return; } iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK; --iterator->level; } static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) { __shadow_walk_next(iterator, *iterator->sptep); } static void __link_shadow_page(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, u64 *sptep, struct kvm_mmu_page *sp, bool flush) { u64 spte; BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); /* * If an SPTE is present already, it must be a leaf and therefore * a large one. Drop it, and flush the TLB if needed, before * installing sp. */ if (is_shadow_present_pte(*sptep)) drop_large_spte(kvm, sptep, flush); spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); mmu_page_add_parent_pte(cache, sp, sptep); /* * The non-direct sub-pagetable must be updated before linking. 
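/*
 * Editor's note: standalone worked example, not kernel code, of the
 * quadrant calculation described above for 32-bit (4-byte GPTE) guests:
 * the 8-byte shadow PDE index covers GVA bits 29:21, so its least
 * significant bit selects which 2MiB half of the guest's 4MiB page-table
 * region the child shadow page maps, and that bit becomes role.quadrant
 * (quadrant = spte_index(sptep) & 1 in kvm_mmu_child_role()). The GVAs
 * below are hypothetical.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
        /* GVA bit 21 set: second 2MiB half of a guest 4MiB PT region. */
        uint32_t gva = 0x00300000;
        unsigned int pde_index = (gva >> 21) & 0x1ff;   /* shadow PDE index */

        assert((pde_index & 1) == 1);           /* -> role.quadrant = 1 */

        /* GVA bit 21 clear: first half, quadrant 0. */
        gva = 0x00400000;
        pde_index = (gva >> 21) & 0x1ff;
        assert((pde_index & 1) == 0);
        return 0;
}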
For * L1 sp, the pagetable is updated via kvm_sync_page() in * kvm_mmu_find_shadow_page() without write-protecting the gfn, * so sp->unsync can be true or false. For higher level non-direct * sp, the pagetable is updated/synced via mmu_sync_children() in * FNAME(fetch)(), so sp->unsync_children can only be false. * WARN_ON_ONCE() if anything happens unexpectedly. */ if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync) mark_unsync(sptep); } static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp) { __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true); } static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; /* * For the direct sp, if the guest pte's dirty bit * changed form clean to dirty, it will corrupt the * sp's access: allow writable in the read-only sp, * so we should update the spte at this point to get * a new sp with the correct access. */ child = spte_to_child_sp(*sptep); if (child->role.access == direct_access) return; drop_parent_pte(vcpu->kvm, child, sptep); kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); } } /* Returns the number of zapped non-leaf child shadow pages. */ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *spte, struct list_head *invalid_list) { u64 pte; struct kvm_mmu_page *child; pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { child = spte_to_child_sp(pte); drop_parent_pte(kvm, child, spte); /* * Recursively zap nested TDP SPs, parentless SPs are * unlikely to be used again in the near future. This * avoids retaining a large number of stale nested SPs. */ if (tdp_enabled && invalid_list && child->role.guest_mode && !child->parent_ptes.val) return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } } else if (is_mmio_spte(kvm, pte)) { mmu_spte_clear_no_track(spte); } return 0; } static int kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int zapped = 0; unsigned i; for (i = 0; i < SPTE_ENT_PER_PAGE; ++i) zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); return zapped; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) drop_parent_pte(kvm, sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *parent, struct list_head *invalid_list) { int i, zapped = 0; struct mmu_page_path parents; struct kvm_mmu_pages pages; if (parent->role.level == PG_LEVEL_4K) return 0; while (mmu_unsync_walk(parent, &pages)) { struct kvm_mmu_page *sp; for_each_sp(pages, sp, parents, i) { kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); mmu_pages_clear_parents(&parents); zapped++; } } return zapped; } static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list, int *nr_zapped) { bool list_unstable, zapped_root = false; lockdep_assert_held_write(&kvm->mmu_lock); trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. 
*/ list_unstable = *nr_zapped; if (!sp->role.invalid && sp_has_gptes(sp)) unaccount_shadowed(kvm, sp); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { /* Count self */ (*nr_zapped)++; /* * Already invalid pages (previously active roots) are not on * the active page list. See list_del() in the "else" case of * !sp->root_count. */ if (sp->role.invalid) list_add(&sp->link, invalid_list); else list_move(&sp->link, invalid_list); kvm_unaccount_mmu_page(kvm, sp); } else { /* * Remove the active root from the active page list, the root * will be explicitly freed when the root_count hits zero. */ list_del(&sp->link); /* * Obsolete pages cannot be used on any vCPUs, see the comment * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also * treats invalid shadow pages as being obsolete. */ zapped_root = !is_obsolete_sp(kvm, sp); } if (sp->nx_huge_page_disallowed) unaccount_nx_huge_page(kvm, sp); sp->role.invalid = 1; /* * Make the request to free obsolete roots after marking the root * invalid, otherwise other vCPUs may not see it as invalid. */ if (zapped_root) kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS); return list_unstable; } static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int nr_zapped; __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); return nr_zapped; } static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list) { struct kvm_mmu_page *sp, *nsp; if (list_empty(invalid_list)) return; /* * We need to make sure everyone sees our modifications to * the page tables and see changes to vcpu->mode here. The barrier * in the kvm_flush_remote_tlbs() achieves this. This pairs * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. * * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit * guest mode and/or lockless shadow page table walks. */ kvm_flush_remote_tlbs(kvm); list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON_ONCE(!sp->role.invalid || sp->root_count); kvm_mmu_free_shadow_page(sp); } } static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, unsigned long nr_to_zap) { unsigned long total_zapped = 0; struct kvm_mmu_page *sp, *tmp; LIST_HEAD(invalid_list); bool unstable; int nr_zapped; if (list_empty(&kvm->arch.active_mmu_pages)) return 0; restart: list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { /* * Don't zap active root pages, the page itself can't be freed * and zapping it will just force vCPUs to realloc and reload. */ if (sp->root_count) continue; unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &nr_zapped); total_zapped += nr_zapped; if (total_zapped >= nr_to_zap) break; if (unstable) goto restart; } kvm_mmu_commit_zap_page(kvm, &invalid_list); kvm->stat.mmu_recycled += total_zapped; return total_zapped; } static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) { if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) return kvm->arch.n_max_mmu_pages - kvm->arch.n_used_mmu_pages; return 0; } static int make_mmu_pages_available(struct kvm_vcpu *vcpu) { unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) return 0; kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); /* * Note, this check is intentionally soft, it only guarantees that one * page is available, while the caller may end up allocating as many as * four pages, e.g. for PAE roots or for 5-level paging. 
Temporarily * exceeding the (arbitrary by default) limit will not harm the host, * being too aggressive may unnecessarily kill the guest, and getting an * exact count is far more trouble than it's worth, especially in the * page fault paths. */ if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; return 0; } /* * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get dead lock */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { write_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - goal_nr_mmu_pages); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; write_unlock(&kvm->mmu_lock); } bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool always_retry) { struct kvm *kvm = vcpu->kvm; LIST_HEAD(invalid_list); struct kvm_mmu_page *sp; gpa_t gpa = cr2_or_gpa; bool r = false; /* * Bail early if there aren't any write-protected shadow pages to avoid * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked * by a third party. Reading indirect_shadow_pages without holding * mmu_lock is safe, as this is purely an optimization, i.e. a false * positive is benign, and a false negative will simply result in KVM * skipping the unprotect+retry path, which is also an optimization. */ if (!READ_ONCE(kvm->arch.indirect_shadow_pages)) goto out; if (!vcpu->arch.mmu->root_role.direct) { gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); if (gpa == INVALID_GPA) goto out; } write_lock(&kvm->mmu_lock); for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); /* * Snapshot the result before zapping, as zapping will remove all list * entries, i.e. checking the list later would yield a false negative. */ r = !list_empty(&invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); out: if (r || always_retry) { vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); vcpu->arch.last_retry_addr = cr2_or_gpa; } return r; } static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); ++kvm->stat.mmu_unsync; sp->unsync = 1; kvm_mmu_mark_parents_unsync(sp); } /* * Attempt to unsync any shadow pages that can be reached by the specified gfn, * KVM is creating a writable mapping for said gfn. Returns 0 if all pages * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected. */ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool synchronizing, bool prefetch) { struct kvm_mmu_page *sp; bool locked = false; /* * Force write-protection if the page is being tracked. Note, the page * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ if (kvm_gfn_is_write_tracked(kvm, slot, gfn)) return -EPERM; /* * The page is not write-tracked, mark existing shadow pages unsync * unless KVM is synchronizing an unsync SP. In that case, KVM must * complete emulation of the guest TLB flush before allowing shadow * pages to become unsync (writable by the guest). 
	 */
	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
		if (synchronizing)
			return -EPERM;

		if (sp->unsync)
			continue;

		if (prefetch)
			return -EEXIST;

		/*
		 * TDP MMU page faults require an additional spinlock as they
		 * run with mmu_lock held for read, not write, and the unsync
		 * logic is not thread safe.  Take the spinlock regardless of
		 * the MMU type to avoid extra conditionals/parameters, there's
		 * no meaningful penalty if mmu_lock is held for write.
		 */
		if (!locked) {
			locked = true;
			spin_lock(&kvm->arch.mmu_unsync_pages_lock);

			/*
			 * Recheck after taking the spinlock, a different vCPU
			 * may have since marked the page unsync.  A false
			 * negative on the unprotected check above is not
			 * possible as clearing sp->unsync _must_ hold mmu_lock
			 * for write, i.e. unsync cannot transition from 1->0
			 * while this CPU holds mmu_lock for read (or write).
			 */
			if (READ_ONCE(sp->unsync))
				continue;
		}

		WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
		kvm_unsync_page(kvm, sp);
	}
	if (locked)
		spin_unlock(&kvm->arch.mmu_unsync_pages_lock);

	/*
	 * We need to ensure that the marking of unsync pages is visible
	 * before the SPTE is updated to allow writes because
	 * kvm_mmu_sync_roots() checks the unsync flags without holding
	 * the MMU lock and so can race with this.  If the SPTE was updated
	 * before the page had been marked as unsync-ed, something like the
	 * following could happen:
	 *
	 * CPU 1                    CPU 2
	 * ---------------------------------------------------------------------
	 * 1.2 Host updates SPTE
	 *     to be writable
	 *                      2.1 Guest writes a GPTE for GVA X.
	 *                          (GPTE being in the guest page table shadowed
	 *                           by the SP from CPU 1.)
	 *                          This reads SPTE during the page table walk.
	 *                          Since SPTE.W is read as 1, there is no
	 *                          fault.
	 *
	 *                      2.2 Guest issues TLB flush.
	 *                          That causes a VM Exit.
	 *
	 *                      2.3 Walking of unsync pages sees sp->unsync is
	 *                          false and skips the page.
	 *
	 *                      2.4 Guest accesses GVA X.
	 *                          Since the mapping in the SP was not updated,
	 *                          the old mapping for GVA X incorrectly
	 *                          gets used.
	 * 1.1 Host marks SP
	 *     as unsync
	 *     (sp->unsync = true)
	 *
	 * The write barrier below ensures that 1.1 happens before 1.2 and thus
	 * the situation in 2.4 does not arise.  It pairs with the read barrier
	 * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
	 */
	smp_wmb();

	return 0;
}

static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
			u64 *sptep, unsigned int pte_access, gfn_t gfn,
			kvm_pfn_t pfn, struct kvm_page_fault *fault)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	int level = sp->role.level;
	int was_rmapped = 0;
	int ret = RET_PF_FIXED;
	bool flush = false;
	bool wrprot;
	u64 spte;

	/* Prefetching always gets a writable pfn. */
	bool host_writable = !fault || fault->map_writable;
	bool prefetch = !fault || fault->prefetch;
	bool write_fault = fault && fault->write;

	if (unlikely(is_noslot_pfn(pfn))) {
		vcpu->stat.pf_mmio_spte_created++;
		mark_mmio_spte(vcpu, sptep, gfn, pte_access);
		return RET_PF_EMULATE;
	}

	if (is_shadow_present_pte(*sptep)) {
		if (prefetch)
			return RET_PF_SPURIOUS;

		/*
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
		 * the parent of the now unreachable PTE.
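
The smp_wmb() above pairs with the smp_rmb() in is_unsync_root() later in this file. The same ordering can be modelled with C11 fences; the sketch below is illustrative only (not KVM code), with 'unsync' standing in for sp->unsync and 'spte_writable' for the SPTE's Writable bit.

#include <stdatomic.h>
#include <stdio.h>

static atomic_bool unsync;
static atomic_bool spte_writable;

/* Writer: order 1.1 (mark unsync) before 1.2 (make the SPTE writable). */
static void writer(void)
{
	atomic_store_explicit(&unsync, true, memory_order_relaxed);	    /* 1.1 */
	atomic_thread_fence(memory_order_release);			    /* ~smp_wmb() */
	atomic_store_explicit(&spte_writable, true, memory_order_relaxed); /* 1.2 */
}

/* Reader: if the writable bit is observed, the acquire fence (the analogue
 * of the smp_rmb() in is_unsync_root()) guarantees unsync is observed too. */
static void reader(void)
{
	if (atomic_load_explicit(&spte_writable, memory_order_relaxed)) {
		atomic_thread_fence(memory_order_acquire);		    /* ~smp_rmb() */
		printf("unsync=%d (guaranteed to be 1 here)\n",
		       atomic_load_explicit(&unsync, memory_order_relaxed));
	}
}

int main(void)
{
	writer();
	reader();
	return 0;
}
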
*/ if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; u64 pte = *sptep; child = spte_to_child_sp(pte); drop_parent_pte(vcpu->kvm, child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { drop_spte(vcpu->kvm, sptep); flush = true; } else was_rmapped = 1; } wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, false, host_writable, &spte); if (*sptep == spte) { ret = RET_PF_SPURIOUS; } else { flush |= mmu_spte_update(sptep, spte); trace_kvm_mmu_set_spte(level, gfn, sptep); } if (wrprot && write_fault) ret = RET_PF_WRITE_PROTECTED; if (flush) kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); rmap_add(vcpu, slot, sptep, gfn, pte_access); } else { /* Already rmapped but the pte_access bits may have changed. */ kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access); } return ret; } static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep, int nr_pages, unsigned int access) { struct page *pages[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; int i; if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM)) return false; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) return false; nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages); if (nr_pages <= 0) return false; for (i = 0; i < nr_pages; i++, gfn++, sptep++) { mmu_set_spte(vcpu, slot, sptep, access, gfn, page_to_pfn(pages[i]), NULL); /* * KVM always prefetches writable pages from the primary MMU, * and KVM can make its SPTE writable in the fast page handler, * without notifying the primary MMU. Mark pages/folios dirty * now to ensure file data is written back if it ends up being * written by the guest. Because KVM's prefetching GUPs * writable PTEs, the probability of unnecessary writeback is * extremely low. */ kvm_release_page_dirty(pages[i]); } return true; } static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) { gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); unsigned int access = sp->role.access; return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access); } static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *sptep) { u64 *spte, *start = NULL; int i; WARN_ON_ONCE(!sp->role.direct); i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { if (is_shadow_present_pte(*spte) || spte == sptep) { if (!start) continue; if (!direct_pte_prefetch_many(vcpu, sp, start, spte)) return; start = NULL; } else if (!start) start = spte; } if (start) direct_pte_prefetch_many(vcpu, sp, start, spte); } static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; sp = sptep_to_sp(sptep); /* * Without accessed bits, there's no way to distinguish between * actually accessed translations and prefetched, so disable pte * prefetch if accessed bits aren't available. */ if (sp_ad_disabled(sp)) return; if (sp->role.level > PG_LEVEL_4K) return; /* * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. */ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; __direct_pte_prefetch(vcpu, sp, sptep); } /* * Lookup the mapping level for @gfn in the current mm. * * WARNING! Use of host_pfn_mapping_level() requires the caller and the end * consumer to be tied into KVM's handlers for MMU notifier events! 
 *
 * There are several ways to safely use this helper:
 *
 * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
 *   consuming it.  In this case, mmu_lock doesn't need to be held during the
 *   lookup, but it does need to be held while checking the MMU notifier.
 *
 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
 *   event for the hva.  This can be done by explicitly checking the MMU notifier
 *   or by ensuring that KVM already has a valid mapping that covers the hva.
 *
 * - Do not use the result to install new mappings, e.g. use the host mapping
 *   level only to decide whether or not to zap an entry.  In this case, it's
 *   not required to hold mmu_lock (though it's highly likely the caller will
 *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
 *
 * Note!  The lookup can still race with modifications to host page tables, but
 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
 * race with the primary MMU occurs.
 */
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
				  const struct kvm_memory_slot *slot)
{
	int level = PG_LEVEL_4K;
	unsigned long hva;
	unsigned long flags;
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	/*
	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
	 * is not solely for performance, it's also necessary to avoid the
	 * "writable" check in __gfn_to_hva_many(), which will always fail on
	 * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
	 * page fault steps have already verified the guest isn't writing a
	 * read-only memslot.
	 */
	hva = __gfn_to_hva_memslot(slot, gfn);

	/*
	 * Disable IRQs to prevent concurrent tear down of host page tables,
	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
	 * the original page table.
	 */
	local_irq_save(flags);

	/*
	 * Read each entry once.  As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk.  Re-reading the entry could send the
	 * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
	 * value) and then p*d_offset() walks into the target huge page instead
	 * of the old page table (sees the new value).
*/ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); if (pgd_none(pgd)) goto out; p4d = READ_ONCE(*p4d_offset(&pgd, hva)); if (p4d_none(p4d) || !p4d_present(p4d)) goto out; pud = READ_ONCE(*pud_offset(&p4d, hva)); if (pud_none(pud) || !pud_present(pud)) goto out; if (pud_leaf(pud)) { level = PG_LEVEL_1G; goto out; } pmd = READ_ONCE(*pmd_offset(&pud, hva)); if (pmd_none(pmd) || !pmd_present(pmd)) goto out; if (pmd_leaf(pmd)) level = PG_LEVEL_2M; out: local_irq_restore(flags); return level; } static int __kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, int max_level, bool is_private) { struct kvm_lpage_info *linfo; int host_level; max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) break; } if (is_private) return max_level; if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn) { bool is_private = kvm_slot_can_be_private(slot) && kvm_mem_is_private(kvm, gfn); return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); } void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; kvm_pfn_t mask; fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled; if (unlikely(fault->max_level == PG_LEVEL_4K)) return; if (is_error_noslot_pfn(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) return; /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot, fault->gfn, fault->max_level, fault->is_private); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; /* * mmu_invalidate_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ fault->goal_level = fault->req_level; mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1; VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask)); fault->pfn &= ~mask; } void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level) { if (cur_level > PG_LEVEL_4K && cur_level == fault->goal_level && is_shadow_present_pte(spte) && !is_large_pte(spte) && spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* * A small SPTE exists for this pfn, but FNAME(fetch), * direct_map(), or kvm_tdp_mmu_map() would like to create a * large PTE instead: just force them to go down another level, * patching back for them into pfn the next 9 bits of the * address. */ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - KVM_PAGES_PER_HPAGE(cur_level - 1); fault->pfn |= fault->gfn & page_mask; fault->goal_level--; } } static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; int ret; gfn_t base_gfn = fault->gfn; kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); for_each_shadow_entry(vcpu, fault->addr, it) { /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. 
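
The masking done at the end of kvm_mmu_hugepage_adjust() above is plain power-of-two arithmetic: a huge mapping spans KVM_PAGES_PER_HPAGE(level) small frames, the gfn and pfn must be equally misaligned within that span, and the pfn is rounded down to the start of the huge frame. A standalone sketch with concrete numbers, using illustrative constants rather than the kernel's macros:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* 2M page = 512 4K pages, 1G page = 512 * 512 4K pages (x86 long mode). */
#define PAGES_PER_2M	512ULL
#define PAGES_PER_1G	(512ULL * 512ULL)

static uint64_t huge_base_pfn(uint64_t gfn, uint64_t pfn, uint64_t pages_per_hpage)
{
	uint64_t mask = pages_per_hpage - 1;

	/* Same invariant as the VM_BUG_ON() in kvm_mmu_hugepage_adjust():
	 * guest and host frames must be equally misaligned in the huge page. */
	assert((gfn & mask) == (pfn & mask));

	return pfn & ~mask;	/* round down to the start of the huge frame */
}

int main(void)
{
	/* gfn 0x12345 and pfn 0xabd45 share the low 9 bits (0x145). */
	printf("base pfn = 0x%llx\n",	/* prints 0xabc00 */
	       (unsigned long long)huge_base_pfn(0x12345, 0xabd45, PAGES_PER_2M));
	return 0;
}
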
*/ if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); if (sp == ERR_PTR(-EEXIST)) continue; link_shadow_page(vcpu, it.sptep, sp); if (fault->huge_page_disallowed) account_nx_huge_page(vcpu->kvm, sp, fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) return -EFAULT; ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; direct_pte_prefetch(vcpu, it.sptep); return ret; } static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long hva = gfn_to_hva_memslot(slot, gfn); send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current); } static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { if (is_sigpending_pfn(fault->pfn)) { kvm_handle_signal_exit(vcpu); return -EINTR; } /* * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. */ if (fault->pfn == KVM_PFN_ERR_RO_FAULT) return RET_PF_EMULATE; if (fault->pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(fault->slot, fault->gfn); return RET_PF_RETRY; } return -EFAULT; } static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { gva_t gva = fault->is_tdp ? 0 : fault->addr; if (fault->is_private) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; /* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an * MMIO SPTE will just be an expensive nop. */ if (unlikely(!enable_mmio_caching)) return RET_PF_EMULATE; /* * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR, * any guest that generates such gfns is running nested and is being * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and * only if L1's MAXPHYADDR is inaccurate with respect to the * hardware's). */ if (unlikely(fault->gfn > kvm_mmu_max_gfn())) return RET_PF_EMULATE; return RET_PF_CONTINUE; } static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault) { /* * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only * reach the common page fault handler if the SPTE has an invalid MMIO * generation number. Refreshing the MMIO generation needs to go down * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag! */ if (fault->rsvd) return false; /* * For hardware-protected VMs, certain conditions like attempting to * perform a write to a page which is not in the state that the guest * expects it to be in can result in a nested/extended #PF. In this * case, the below code might misconstrue this situation as being the * result of a write-protected access, and treat it as a spurious case * rather than taking any action to satisfy the real source of the #PF * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the * guest spinning on a #PF indefinitely, so don't attempt the fast path * in this case. 
* * Note that the kvm_mem_is_private() check might race with an * attribute update, but this will either result in the guest spinning * on RET_PF_SPURIOUS until the update completes, or an actual spurious * case might go down the slow path. Either case will resolve itself. */ if (kvm->arch.has_private_mem && fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) return false; /* * #PF can be fast if: * * 1. The shadow page table entry is not present and A/D bits are * disabled _by KVM_, which could mean that the fault is potentially * caused by access tracking (if enabled). If A/D bits are enabled * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D * bits for L2 and employ access tracking, but the fast page fault * mechanism only supports direct MMUs. * 2. The shadow page table entry is present, the access is a write, * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e. * the fault was caused by a write-protection violation. If the * SPTE is MMU-writable (determined later), the fault can be fixed * by setting the Writable bit, which can be done out of mmu_lock. */ if (!fault->present) return !kvm_ad_enabled; /* * Note, instruction fetches and writes are mutually exclusive, ignore * the "exec" flag. */ return fault->write; } /* * Returns true if the SPTE was fixed successfully. Otherwise, * someone else modified the SPTE from its original value. */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, u64 new_spte) { /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in * set_spte. But fast_page_fault is very unlikely to happen with PML * enabled, so we do not do this. This might result in the same GPA * to be logged in PML buffer again when the write really happens, and * eventually to be called by mark_page_dirty twice. But it's also no * harm. This also avoids the TLB flush needed after setting dirty bit * so non-PML cases won't be impacted. * * Compare with make_spte() where instead shadow_dirty_mask is set. */ if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn); return true; } static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) { if (fault->exec) return is_executable_pte(spte); if (fault->write) return is_writable_pte(spte); /* Fault was on Read access */ return spte & PT_PRESENT_MASK; } /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no * walk could be performed, returns NULL and *spte does not contain valid data. * * Contract: * - Must be called between walk_shadow_page_lockless_{begin,end}. * - The returned sptep must not be used after walk_shadow_page_lockless_end. */ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) { struct kvm_shadow_walk_iterator iterator; u64 old_spte; u64 *sptep = NULL; for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { sptep = iterator.sptep; *spte = old_spte; } return sptep; } /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. 
*/ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte; u64 *sptep; uint retry_count = 0; if (!page_fault_can_be_fast(vcpu->kvm, fault)) return ret; walk_shadow_page_lockless_begin(vcpu); do { u64 new_spte; if (tdp_mmu_enabled) sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); /* * It's entirely possible for the mapping to have been zapped * by a different task, but the root page should always be * available as the vCPU holds a reference to its root(s). */ if (WARN_ON_ONCE(!sptep)) spte = FROZEN_SPTE; if (!is_shadow_present_pte(spte)) break; sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; /* * Check whether the memory access that caused the fault would * still cause it if it were to be performed right now. If not, * then this is a spurious fault caused by TLB lazily flushed, * or some other CPU has already fixed the PTE after the * current CPU took the fault. * * Need not check the access of upper level table entries since * they are always ACC_ALL. */ if (is_access_allowed(fault, spte)) { ret = RET_PF_SPURIOUS; break; } new_spte = spte; /* * KVM only supports fixing page faults outside of MMU lock for * direct MMUs, nested MMUs are always indirect, and KVM always * uses A/D bits for non-nested MMUs. Thus, if A/D bits are * enabled, the SPTE can't be an access-tracked SPTE. */ if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte)) new_spte = restore_acc_track_spte(new_spte) | shadow_accessed_mask; /* * To keep things simple, only SPTEs that are MMU-writable can * be made fully writable outside of mmu_lock, e.g. only SPTEs * that were write-protected for dirty-logging or access * tracking are handled here. Don't bother checking if the * SPTE is writable to prioritize running with A/D bits enabled. * The is_access_allowed() check above handles the common case * of the fault being spurious, and the SPTE is known to be * shadow-present, i.e. except for access tracking restoration * making the new SPTE writable, the check is wasteful. */ if (fault->write && is_mmu_writable_spte(spte)) { new_spte |= PT_WRITABLE_MASK; /* * Do not fix write-permission on the large spte when * dirty logging is enabled. Since we only dirty the * first page into the dirty-bitmap in * fast_pf_fix_direct_spte(), other pages are missed * if its slot has dirty logging enabled. * * Instead, we let the slow page fault path create a * normal spte to fix the access. */ if (sp->role.level > PG_LEVEL_4K && kvm_slot_dirty_track_enabled(fault->slot)) break; } /* Verify that the fault can be handled in the fast path */ if (new_spte == spte || !is_access_allowed(fault, new_spte)) break; /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. 
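
The core of this fast path is the lockless compare-and-exchange in fast_pf_fix_direct_spte(): build a new SPTE from a snapshot, install it only if the SPTE hasn't changed in the meantime, and retry a bounded number of times otherwise. A minimal userspace analogue using C11 atomics (illustrative names; atomic_compare_exchange_strong() refreshes the expected value on failure, much like the cmpxchg used above):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define WRITABLE_BIT	(1ULL << 1)	/* stand-in for PT_WRITABLE_MASK */
#define MAX_RETRIES	4

static _Atomic uint64_t spte = 0x1000;	/* some present, read-only "SPTE" */

/* Try to set the writable bit without a lock, like the fast #PF path. */
static bool fast_fix_writable(void)
{
	uint64_t old = atomic_load(&spte);

	for (int retry = 0; retry < MAX_RETRIES; retry++) {
		uint64_t new = old | WRITABLE_BIT;

		/* On failure, 'old' is refreshed with the current value. */
		if (atomic_compare_exchange_strong(&spte, &old, new))
			return true;
	}
	return false;	/* kept losing the race; fall back to the slow path */
}

int main(void)
{
	printf("fixed=%d spte=0x%llx\n", fast_fix_writable(),
	       (unsigned long long)atomic_load(&spte));
	return 0;
}
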
*/ if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } if (++retry_count > 4) { pr_warn_once("Fast #PF retrying more than 4 times.\n"); break; } } while (true); trace_fast_page_fault(vcpu, fault, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); if (ret != RET_PF_INVALID) vcpu->stat.pf_fast++; return ret; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, struct list_head *invalid_list) { struct kvm_mmu_page *sp; if (!VALID_PAGE(*root_hpa)) return; sp = root_to_sp(*root_hpa); if (WARN_ON_ONCE(!sp)) return; if (is_tdp_mmu_page(sp)) { lockdep_assert_held_read(&kvm->mmu_lock); kvm_tdp_mmu_put_root(kvm, sp); } else { lockdep_assert_held_write(&kvm->mmu_lock); if (!--sp->root_count && sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); } *root_hpa = INVALID_PAGE; } /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free) { bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct; int i; LIST_HEAD(invalid_list); bool free_active_root; WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL); BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); /* Before acquiring the MMU lock, see if we need to do any real work. */ free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT) && VALID_PAGE(mmu->root.hpa); if (!free_active_root) { for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && VALID_PAGE(mmu->prev_roots[i].hpa)) break; if (i == KVM_MMU_NUM_PREV_ROOTS) return; } if (is_tdp_mmu) read_lock(&kvm->mmu_lock); else write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, &invalid_list); if (free_active_root) { if (kvm_mmu_is_dummy_root(mmu->root.hpa)) { /* Nothing to cleanup for dummy roots. */ } else if (root_to_sp(mmu->root.hpa)) { mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list); } else if (mmu->pae_root) { for (i = 0; i < 4; ++i) { if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) continue; mmu_free_root_page(kvm, &mmu->pae_root[i], &invalid_list); mmu->pae_root[i] = INVALID_PAE_ROOT; } } mmu->root.hpa = INVALID_PAGE; mmu->root.pgd = 0; } if (is_tdp_mmu) { read_unlock(&kvm->mmu_lock); WARN_ON_ONCE(!list_empty(&invalid_list)); } else { kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); } } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; struct kvm_mmu_page *sp; hpa_t root_hpa; int i; /* * This should not be called while L2 is active, L2 can't invalidate * _only_ its own roots, e.g. INVVPID unconditionally exits. 
*/ WARN_ON_ONCE(mmu->root_role.guest_mode); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { root_hpa = mmu->prev_roots[i].hpa; if (!VALID_PAGE(root_hpa)) continue; sp = root_to_sp(root_hpa); if (!sp || sp->role.guest_mode) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } kvm_mmu_free_roots(kvm, mmu, roots_to_free); } EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, u8 level) { union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; struct kvm_mmu_page *sp; role.level = level; role.quadrant = quadrant; WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); sp = kvm_mmu_get_shadow_page(vcpu, gfn, role); ++sp->root_count; return __pa(sp->spt); } static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u8 shadow_root_level = mmu->root_role.level; hpa_t root; unsigned i; int r; if (tdp_mmu_enabled) return kvm_tdp_mmu_alloc_root(vcpu); write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) goto out_unlock; if (shadow_root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { if (WARN_ON_ONCE(!mmu->pae_root)) { r = -EIO; goto out_unlock; } for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; } mmu->root.hpa = __pa(mmu->pae_root); } else { WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); r = -EIO; goto out_unlock; } /* root.pgd is ignored for direct MMUs. */ mmu->root.pgd = 0; out_unlock: write_unlock(&vcpu->kvm->mmu_lock); return r; } static int mmu_first_shadow_root_alloc(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; int r = 0, i, bkt; /* * Check if this is the first shadow root being allocated before * taking the lock. */ if (kvm_shadow_root_allocated(kvm)) return 0; mutex_lock(&kvm->slots_arch_lock); /* Recheck, under the lock, whether this is the first shadow root. */ if (kvm_shadow_root_allocated(kvm)) goto out_unlock; /* * Check if anything actually needs to be allocated, e.g. all metadata * will be allocated upfront if TDP is disabled. */ if (kvm_memslots_have_rmaps(kvm) && kvm_page_track_write_tracking_enabled(kvm)) goto out_success; for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, bkt, slots) { /* * Both of these functions are no-ops if the target is * already allocated, so unconditionally calling both * is safe. Intentionally do NOT free allocations on * failure to avoid having to track which allocations * were made now versus when the memslot was created. * The metadata is guaranteed to be freed when the slot * is freed, and will be kept/used if userspace retries * KVM_RUN instead of killing the VM. */ r = memslot_rmap_alloc(slot, slot->npages); if (r) goto out_unlock; r = kvm_page_track_write_tracking_alloc(slot); if (r) goto out_unlock; } } /* * Ensure that shadow_root_allocated becomes true strictly after * all the related pointers are set. 
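
mmu_first_shadow_root_alloc() above is a double-checked initialisation pattern: an unlocked fast-path check, a re-check under the lock, and a release store of the "initialised" flag so that anyone who observes the flag also observes the allocations made before it. A compact userspace sketch with C11 atomics and a pthread mutex (names are illustrative, not KVM's):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
static int *shared_metadata;		/* stand-in for the per-slot metadata */
static atomic_bool metadata_ready;	/* stand-in for shadow_root_allocated */

static int ensure_metadata(void)
{
	/* Fast path: the acquire load pairs with the release store below. */
	if (atomic_load_explicit(&metadata_ready, memory_order_acquire))
		return 0;

	pthread_mutex_lock(&init_lock);
	if (!atomic_load_explicit(&metadata_ready, memory_order_relaxed)) {
		shared_metadata = calloc(1024, sizeof(*shared_metadata));
		if (!shared_metadata) {
			pthread_mutex_unlock(&init_lock);
			return -1;
		}
		/* Publish strictly after the pointers are set; this is the
		 * smp_store_release() analogue. */
		atomic_store_explicit(&metadata_ready, true, memory_order_release);
	}
	pthread_mutex_unlock(&init_lock);
	return 0;
}

int main(void)
{
	return ensure_metadata();
}
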
*/ out_success: smp_store_release(&kvm->arch.shadow_root_allocated, true); out_unlock: mutex_unlock(&kvm->slots_arch_lock); return r; } static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; int quadrant, i, r; hpa_t root; root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT; if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { mmu->root.hpa = kvm_mmu_get_dummy_root(); return 0; } /* * On SVM, reading PDPTRs might access guest memory, which might fault * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. */ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { pdptrs[i] = mmu->get_pdptr(vcpu, i); if (!(pdptrs[i] & PT_PRESENT_MASK)) continue; if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT)) pdptrs[i] = 0; } } r = mmu_first_shadow_root_alloc(vcpu->kvm); if (r) return r; write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) goto out_unlock; /* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, mmu->root_role.level); mmu->root.hpa = root; goto set_root_pgd; } if (WARN_ON_ONCE(!mmu->pae_root)) { r = -EIO; goto out_unlock; } /* * We shadow a 32 bit page table. This may be a legacy 2-level * or a PAE 3-level page table. In either case we need to be aware that * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK | shadow_me_value; if (mmu->root_role.level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; if (WARN_ON_ONCE(!mmu->pml4_root)) { r = -EIO; goto out_unlock; } mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; if (mmu->root_role.level == PT64_ROOT_5LEVEL) { if (WARN_ON_ONCE(!mmu->pml5_root)) { r = -EIO; goto out_unlock; } mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; } } for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { if (!(pdptrs[i] & PT_PRESENT_MASK)) { mmu->pae_root[i] = INVALID_PAE_ROOT; continue; } root_gfn = pdptrs[i] >> PAGE_SHIFT; } /* * If shadowing 32-bit non-PAE page tables, each PAE page * directory maps one quarter of the guest's non-PAE page * directory. Othwerise each PAE page direct shadows one guest * PAE page directory so that quadrant should be 0. */ quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } if (mmu->root_role.level == PT64_ROOT_5LEVEL) mmu->root.hpa = __pa(mmu->pml5_root); else if (mmu->root_role.level == PT64_ROOT_4LEVEL) mmu->root.hpa = __pa(mmu->pml4_root); else mmu->root.hpa = __pa(mmu->pae_root); set_root_pgd: mmu->root.pgd = root_pgd; out_unlock: write_unlock(&vcpu->kvm->mmu_lock); return r; } static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL; u64 *pml5_root = NULL; u64 *pml4_root = NULL; u64 *pae_root; /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP * tables are allocated and initialized at root creation as there is no * equivalent level in the guest's NPT to shadow. Allocate the tables * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. 
*/ if (mmu->root_role.direct || mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL || mmu->root_role.level < PT64_ROOT_4LEVEL) return 0; /* * NPT, the only paging mode that uses this horror, uses a fixed number * of levels for the shadow page tables, e.g. all MMUs are 4-level or * all MMus are 5-level. Thus, this can safely require that pml5_root * is allocated if the other roots are valid and pml5 is needed, as any * prior MMU would also have required pml5. */ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) return 0; /* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid. */ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || (need_pml5 && mmu->pml5_root))) return -EIO; /* * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and * doesn't need to be decrypted. */ pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pae_root) return -ENOMEM; #ifdef CONFIG_X86_64 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml4_root) goto err_pml4; if (need_pml5) { pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml5_root) goto err_pml5; } #endif mmu->pae_root = pae_root; mmu->pml4_root = pml4_root; mmu->pml5_root = pml5_root; return 0; #ifdef CONFIG_X86_64 err_pml5: free_page((unsigned long)pml4_root); err_pml4: free_page((unsigned long)pae_root); return -ENOMEM; #endif } static bool is_unsync_root(hpa_t root) { struct kvm_mmu_page *sp; if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root)) return false; /* * The read barrier orders the CPU's read of SPTE.W during the page table * walk before the reads of sp->unsync/sp->unsync_children here. * * Even if another CPU was marking the SP as unsync-ed simultaneously, * any guest page table changes are not guaranteed to be visible anyway * until this VCPU issues a TLB flush strictly after those changes are * made. We only need to ensure that the other CPU sets these flags * before any actual changes to the page tables are made. The comments * in mmu_try_to_unsync_pages() describe what could go wrong if this * requirement isn't satisfied. */ smp_rmb(); sp = root_to_sp(root); /* * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the * PDPTEs for a given PAE root need to be synchronized individually. 
*/ if (WARN_ON_ONCE(!sp)) return false; if (sp->unsync || sp->unsync_children) return true; return false; } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; if (vcpu->arch.mmu->root_role.direct) return; if (!VALID_PAGE(vcpu->arch.mmu->root.hpa)) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root.hpa; if (!is_unsync_root(root)) return; sp = root_to_sp(root); write_lock(&vcpu->kvm->mmu_lock); mmu_sync_children(vcpu, sp, true); write_unlock(&vcpu->kvm->mmu_lock); return; } write_lock(&vcpu->kvm->mmu_lock); for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { sp = spte_to_child_sp(root); mmu_sync_children(vcpu, sp, true); } } write_unlock(&vcpu->kvm->mmu_lock); } void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) { unsigned long roots_to_free = 0; int i; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa)) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); /* sync prev_roots by simply freeing them */ kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free); } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t vaddr, u64 access, struct x86_exception *exception) { if (exception) exception->error_code = 0; return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { /* * A nested guest cannot use the MMIO cache if it is using nested * page tables, because cr2 is a nGPA while the cache stores GPAs. */ if (mmu_is_nested(vcpu)) return false; if (direct) return vcpu_match_mmio_gpa(vcpu, addr); return vcpu_match_mmio_gva(vcpu, addr); } /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. * * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { struct kvm_shadow_walk_iterator iterator; int leaf = -1; u64 spte; for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf] = spte; } return leaf; } static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { int leaf; walk_shadow_page_lockless_begin(vcpu); if (is_tdp_mmu_active(vcpu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level); else leaf = get_walk(vcpu, addr, sptes, root_level); walk_shadow_page_lockless_end(vcpu); return leaf; } /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; struct rsvd_bits_validate *rsvd_check; int root, leaf, level; bool reserved = false; leaf = get_sptes_lockless(vcpu, addr, sptes, &root); if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; } *sptep = sptes[leaf]; /* * Skip reserved bits checks on the terminal leaf if it's not a valid * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by * design, always have reserved bits set. The purpose of the checks is * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. 
*/ if (!is_shadow_present_pte(sptes[leaf])) leaf++; rsvd_check = &vcpu->arch.mmu->shadow_zero_check; for (level = root; level >= leaf; level--) reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); if (reserved) { pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", __func__, addr); for (level = root; level >= leaf; level--) pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", sptes[level], level, get_rsvd_bits(rsvd_check, sptes[level], level)); } return reserved; } static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; if (mmio_info_in_cache(vcpu, addr, direct)) return RET_PF_EMULATE; reserved = get_mmio_spte(vcpu, addr, &spte); if (WARN_ON_ONCE(reserved)) return -EINVAL; if (is_mmio_spte(vcpu->kvm, spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned int access = get_mmio_spte_access(spte); if (!check_mmio_spte(vcpu, spte)) return RET_PF_INVALID; if (direct) addr = 0; trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); return RET_PF_EMULATE; } /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ return RET_PF_RETRY; } static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { if (unlikely(fault->rsvd)) return false; if (!fault->present || !fault->write) return false; /* * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn)) return true; return false; } static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) { struct kvm_shadow_walk_iterator iterator; u64 spte; walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) clear_sp_write_flooding_count(iterator.sptep); walk_shadow_page_lockless_end(vcpu); } static u32 alloc_apf_token(struct kvm_vcpu *vcpu) { /* make sure the token value is not 0 */ u32 id = vcpu->arch.apf.id; if (id << 12 == 0) vcpu->arch.apf.id = 1; return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; } static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_arch_async_pf arch; arch.token = alloc_apf_token(vcpu); arch.gfn = fault->gfn; arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); return kvm_setup_async_pf(vcpu, fault->addr, kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) return; if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || work->wakeup_all) return; r = kvm_mmu_reload(vcpu); if (unlikely(r)) return; if (!vcpu->arch.mmu->root_role.direct && work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL, NULL); /* * Account fixed page faults, otherwise they'll never be counted, but * ignore stats for all other return times. 
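
alloc_apf_token() above packs two values into one 32-bit token: a per-vCPU sequence number in the upper bits and the vCPU id in the low 12 bits, with the sequence bumped so the token can never be zero. A tiny standalone version of that packing (field widths mirror the shifts in the code above; the struct is illustrative):

#include <stdint.h>
#include <stdio.h>

#define VCPU_ID_BITS	12

struct apf_state {
	uint32_t id;		/* per-vCPU sequence counter */
	uint32_t vcpu_id;
};

static uint32_t alloc_token(struct apf_state *s)
{
	/* Make sure the sequence part is never 0, as in alloc_apf_token(). */
	if ((uint32_t)(s->id << VCPU_ID_BITS) == 0)
		s->id = 1;

	return (s->id++ << VCPU_ID_BITS) | s->vcpu_id;
}

int main(void)
{
	struct apf_state s = { .id = 0, .vcpu_id = 3 };

	printf("0x%x 0x%x\n", alloc_token(&s), alloc_token(&s)); /* 0x1003 0x2003 */
	return 0;
}
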
Page-ready "faults" aren't * truly spurious and never trigger emulation */ if (r == RET_PF_FIXED) vcpu->stat.pf_fixed++; } static inline u8 kvm_max_level_for_order(int order) { BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) return PG_LEVEL_1G; if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) return PG_LEVEL_2M; return PG_LEVEL_4K; } static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, u8 max_level, int gmem_order) { u8 req_max_level; if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; max_level = min(kvm_max_level_for_order(gmem_order), max_level); if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn); if (req_max_level) max_level = min(max_level, req_max_level); return max_level; } static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int r) { kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page, r == RET_PF_RETRY, fault->map_writable); } static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { int max_order, r; if (!kvm_slot_can_be_private(fault->slot)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, &fault->refcounted_page, &max_order); if (r) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return r; } fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, fault->max_level, max_order); return RET_PF_CONTINUE; } static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { unsigned int foll = fault->write ? FOLL_WRITE : 0; if (fault->is_private) return kvm_mmu_faultin_pfn_private(vcpu, fault); foll |= FOLL_NOWAIT; fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, &fault->map_writable, &fault->refcounted_page); /* * If resolving the page failed because I/O is needed to fault-in the * page, then either set up an asynchronous #PF to do the I/O, or if * doing an async #PF isn't possible, retry with I/O allowed. All * other failures are terminal, i.e. retrying won't help. */ if (fault->pfn != KVM_PFN_ERR_NEEDS_IO) return RET_PF_CONTINUE; if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(fault->addr, fault->gfn); if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return RET_PF_RETRY; } else if (kvm_arch_setup_async_pf(vcpu, fault)) { return RET_PF_RETRY; } } /* * Allow gup to bail on pending non-fatal signals when it's also allowed * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ foll |= FOLL_INTERRUPTIBLE; foll &= ~FOLL_NOWAIT; fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, &fault->map_writable, &fault->refcounted_page); return RET_PF_CONTINUE; } static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { struct kvm_memory_slot *slot = fault->slot; int ret; /* * Note that the mmu_invalidate_seq also serves to detect a concurrent * change in attributes. 
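
kvm_max_level_for_order() above is a pair of threshold checks: a folio of order 9 (512 contiguous 4K pages) can back a 2M mapping and order 18 can back a 1G mapping on x86. A standalone version with those constants spelled out (illustrative enum and macros, not the kernel's):

#include <stdio.h>

enum level { LEVEL_4K = 1, LEVEL_2M = 2, LEVEL_1G = 3 };

/* On x86: 2M/4K = 2^9 pages, 1G/4K = 2^18 pages. */
#define ORDER_2M	9
#define ORDER_1G	18

static enum level max_level_for_order(int order)
{
	if (order >= ORDER_1G)
		return LEVEL_1G;
	if (order >= ORDER_2M)
		return LEVEL_2M;
	return LEVEL_4K;
}

int main(void)
{
	printf("%d %d %d\n", max_level_for_order(0),	/* 4K */
	       max_level_for_order(9),			/* 2M */
	       max_level_for_order(20));		/* 1G */
	return 0;
}
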
is_page_fault_stale() will detect an
	 * invalidation related to fault->gfn and resume the guest without
	 * installing a mapping in the page tables.
	 */
	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	smp_rmb();

	/*
	 * Now that we have a snapshot of mmu_invalidate_seq we can check for a
	 * private vs. shared mismatch.
	 */
	if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
		return -EFAULT;
	}

	if (unlikely(!slot))
		return kvm_handle_noslot_fault(vcpu, fault, access);

	/*
	 * Retry the page fault if the gfn hit a memslot that is being deleted
	 * or moved.  This ensures any existing SPTEs for the old memslot will
	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
	 */
	if (slot->flags & KVM_MEMSLOT_INVALID)
		return RET_PF_RETRY;

	if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
		/*
		 * Don't map L1's APIC access page into L2, KVM doesn't support
		 * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
		 * i.e. the access needs to be emulated.  Emulating access to
		 * L1's APIC is also correct if L1 is accelerating L2's own
		 * virtual APIC, but for some reason L1 also maps _L1's_ APIC
		 * into L2.  Note, vcpu_is_mmio_gpa() always treats access to
		 * the APIC as MMIO.  Allow an MMIO SPTE to be created, as KVM
		 * uses different roots for L1 vs. L2, i.e. there is no danger
		 * of breaking APICv/AVIC for L1.
		 */
		if (is_guest_mode(vcpu))
			return kvm_handle_noslot_fault(vcpu, fault, access);

		/*
		 * If the APIC access page exists but is disabled, go directly
		 * to emulation without caching the MMIO access or creating a
		 * MMIO SPTE.  That way the cache doesn't need to be purged
		 * when the AVIC is re-enabled.
		 */
		if (!kvm_apicv_activated(vcpu->kvm))
			return RET_PF_EMULATE;
	}

	/*
	 * Check for a relevant mmu_notifier invalidation event before getting
	 * the pfn from the primary MMU, and before acquiring mmu_lock.
	 *
	 * For mmu_lock, if there is an in-progress invalidation and the kernel
	 * allows preemption, the invalidation task may drop mmu_lock and yield
	 * in response to mmu_lock being contended, which is *very* counter-
	 * productive as this vCPU can't actually make forward progress until
	 * the invalidation completes.
	 *
	 * Retrying now can also avoid unnecessary lock contention in the primary
	 * MMU, as the primary MMU doesn't necessarily hold a single lock for
	 * the duration of the invalidation, i.e. faulting in a conflicting pfn
	 * can cause the invalidation to take longer by holding locks that are
	 * needed to complete the invalidation.
	 *
	 * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
	 * will never yield mmu_lock in response to contention, as this vCPU is
	 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
	 * to detect retry guarantees the worst case latency for the vCPU.
	 */
	if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
		return RET_PF_RETRY;

	ret = __kvm_mmu_faultin_pfn(vcpu, fault);
	if (ret != RET_PF_CONTINUE)
		return ret;

	if (unlikely(is_error_pfn(fault->pfn)))
		return kvm_handle_error_pfn(vcpu, fault);

	if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
		return kvm_handle_noslot_fault(vcpu, fault, access);

	/*
	 * Check again for a relevant mmu_notifier invalidation event purely to
	 * avoid contending mmu_lock.  Most invalidations will be detected by
	 * the previous check, but checking is extremely cheap relative to the
	 * overall cost of failing to detect the invalidation until after
	 * mmu_lock is acquired.
*/ if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) { kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY); return RET_PF_RETRY; } return RET_PF_CONTINUE; } /* * Returns true if the page fault is stale and needs to be retried, i.e. if the * root was invalidated by a memslot update or a relevant mmu_notifier fired. */ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); /* Special roots, e.g. pae_root, are not backed by shadow pages. */ if (sp && is_obsolete_sp(vcpu->kvm, sp)) return true; /* * Roots without an associated shadow page are considered invalid if * there is a pending request to free obsolete roots. The request is * only a hint that the current root _may_ be obsolete and needs to be * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs * to reload even if no vCPU is actively using the root. */ if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) return true; /* * Check for a relevant mmu_notifier invalidation event one last time * now that mmu_lock is held, as the "unsafe" checks performed without * holding mmu_lock can get false negatives. */ return fault->slot && mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { int r; /* Dummy roots are used only for shadowing bad guest roots. */ if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r; r = mmu_topup_memory_caches(vcpu, false); if (r) return r; r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; r = direct_map(vcpu, fault); out_unlock: kvm_mmu_finish_page_fault(vcpu, fault, r); write_unlock(&vcpu->kvm->mmu_lock); return r; } static int nonpaging_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ fault->max_level = PG_LEVEL_2M; return direct_page_fault(vcpu, fault); } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { int r = 1; u32 flags = vcpu->arch.apf.host_apf_flags; #ifndef CONFIG_X86_64 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ if (WARN_ON_ONCE(fault_address >> 32)) return -EFAULT; #endif /* * Legacy #PF exception only have a 32-bit error code. Simply drop the * upper bits as KVM doesn't use them for #PF (because they are never * set), and to ensure there are no collisions with KVM-defined bits. */ if (WARN_ON_ONCE(error_code >> 32)) error_code = lower_32_bits(error_code); /* * Restrict KVM-defined flags to bits 63:32 so that it's impossible for * them to conflict with #PF error codes, which are limited to 32 bits. 
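
The BUILD_BUG_ON() that follows enforces a simple partition of the 64-bit error code: hardware #PF bits occupy the low 32 bits and KVM-defined synthetic flags the high 32 bits, so the two can never collide. A small sketch of that partition; the flag names and bit positions below are made up for illustration, not the real PFERR_* values:

#include <stdint.h>
#include <stdio.h>

/* Hardware #PF error codes are at most 32 bits wide. */
#define HW_ERROR_MASK	0x00000000ffffffffULL

/* Pretend KVM-defined flags; the real PFERR_* values differ. */
#define SYNTH_FLAG_A	(1ULL << 48)
#define SYNTH_FLAG_B	(1ULL << 57)
#define SYNTH_MASK	(SYNTH_FLAG_A | SYNTH_FLAG_B)

/* Compile-time analogue of BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)). */
_Static_assert((SYNTH_MASK & HW_ERROR_MASK) == 0,
	       "synthetic flags must not overlap hardware error bits");

int main(void)
{
	uint64_t error_code = 0x7 | SYNTH_FLAG_A;  /* P|W|U plus one synthetic flag */

	printf("hw bits: 0x%llx, synthetic bits: 0x%llx\n",
	       (unsigned long long)(error_code & HW_ERROR_MASK),
	       (unsigned long long)(error_code & SYNTH_MASK));
	return 0;
}
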
*/ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; if (!flags) { trace_kvm_page_fault(vcpu, fault_address, error_code); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); } else { WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); } return r; } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); #ifdef CONFIG_X86_64 static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { int r; if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r; r = mmu_topup_memory_caches(vcpu, false); if (r) return r; r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; r = RET_PF_RETRY; read_lock(&vcpu->kvm->mmu_lock); if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = kvm_tdp_mmu_map(vcpu, fault); out_unlock: kvm_mmu_finish_page_fault(vcpu, fault, r); read_unlock(&vcpu->kvm->mmu_lock); return r; } #endif bool kvm_mmu_may_ignore_guest_pat(void) { /* * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to * honor the memtype from the guest's PAT so that guest accesses to * memory that is DMA'd aren't cached against the guest's wishes. As a * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, * KVM _always_ ignores guest PAT (when EPT is enabled). */ return shadow_memtype_mask; } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { #ifdef CONFIG_X86_64 if (tdp_mmu_enabled) return kvm_tdp_mmu_page_fault(vcpu, fault); #endif return direct_page_fault(vcpu, fault); } static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) { int r; /* * Restrict to TDP page fault, since that's the only case where the MMU * is indexed by GPA. */ if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault) return -EOPNOTSUPP; do { if (signal_pending(current)) return -EINTR; cond_resched(); r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); } while (r == RET_PF_RETRY); if (r < 0) return r; switch (r) { case RET_PF_FIXED: case RET_PF_SPURIOUS: case RET_PF_WRITE_PROTECTED: return 0; case RET_PF_EMULATE: return -ENOENT; case RET_PF_RETRY: case RET_PF_CONTINUE: case RET_PF_INVALID: default: WARN_ONCE(1, "could not fix page fault during prefault"); return -EIO; } } long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) { u64 error_code = PFERR_GUEST_FINAL_MASK; u8 level = PG_LEVEL_4K; u64 end; int r; if (!vcpu->kvm->arch.pre_fault_allowed) return -EOPNOTSUPP; /* * reload is efficient when called repeatedly, so we can do it on * every iteration. */ r = kvm_mmu_reload(vcpu); if (r) return r; if (kvm_arch_has_private_mem(vcpu->kvm) && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) error_code |= PFERR_PRIVATE_ACCESS; /* * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging. */ r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); if (r < 0) return r; /* * If the mapping that covers range->gpa can use a huge page, it * may start below it or end after range->gpa + range->size. 
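
The arithmetic just below rounds range->gpa down to the start of the (possibly huge) page that was mapped, adds the page size to find where the mapping ends, and clamps to the requested size. A worked standalone version with concrete numbers (constants are illustrative):

#include <stdint.h>
#include <stdio.h>

#define SZ_2M	(2ULL * 1024 * 1024)

/*
 * How many bytes of [gpa, gpa + size) are covered by the huge page of
 * hpage_size that contains gpa?  Mirrors the end/min() computation below.
 */
static uint64_t prefault_covered(uint64_t gpa, uint64_t size, uint64_t hpage_size)
{
	uint64_t end = (gpa & ~(hpage_size - 1)) + hpage_size;
	uint64_t covered = end - gpa;

	return size < covered ? size : covered;
}

int main(void)
{
	/*
	 * A GPA 12K into a 2M page: the installed 2M mapping covers far more
	 * than the 64K that was requested, so the requested size wins.
	 */
	printf("%llu\n", (unsigned long long)
	       prefault_covered(0x1203000, 64 * 1024, SZ_2M));	/* 65536 */
	return 0;
}
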
*/ end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); return min(range->size, end - range->gpa); } static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_spte = NULL; } static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { struct kvm_mmu_page *sp; if (!VALID_PAGE(root->hpa)) return false; if (!role.direct && pgd != root->pgd) return false; sp = root_to_sp(root->hpa); if (WARN_ON_ONCE(!sp)) return false; return role.word == sp->role.word; } /* * Find out if a previously cached root matching the new pgd/role is available, * and insert the current root as the MRU in the cache. * If a matching root is found, it is assigned to kvm_mmu->root and * true is returned. * If no match is found, kvm_mmu->root is left invalid, the LRU root is * evicted to make room for the current root, and false is returned. */ static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { uint i; if (is_root_usable(&mmu->root, new_pgd, new_role)) return true; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { /* * The swaps end up rotating the cache like this: * C 0 1 2 3 (on entry to the function) * 0 C 1 2 3 * 1 C 0 2 3 * 2 C 0 1 3 * 3 C 0 1 2 (on exit from the loop) */ swap(mmu->root, mmu->prev_roots[i]); if (is_root_usable(&mmu->root, new_pgd, new_role)) return true; } kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); return false; } /* * Find out if a previously cached root matching the new pgd/role is available. * On entry, mmu->root is invalid. * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry * of the cache becomes invalid, and true is returned. * If no match is found, kvm_mmu->root is left invalid and false is returned. */ static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { uint i; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role)) goto hit; return false; hit: swap(mmu->root, mmu->prev_roots[i]); /* Bubble up the remaining roots. */ for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++) mmu->prev_roots[i] = mmu->prev_roots[i + 1]; mmu->prev_roots[i].hpa = INVALID_PAGE; return true; } static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { /* * Limit reuse to 64-bit hosts+VMs without "special" roots in order to * avoid having to deal with PDPTEs and other complexities. */ if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa)) kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); if (VALID_PAGE(mmu->root.hpa)) return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role); else return cached_root_find_without_current(kvm, mmu, new_pgd, new_role); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) { struct kvm_mmu *mmu = vcpu->arch.mmu; union kvm_mmu_page_role new_role = mmu->root_role; /* * Return immediately if no usable root was found, kvm_mmu_reload() * will establish a valid root prior to the next VM-Enter. */ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) return; /* * It's possible that the cached previous root page is obsolete because * of a change in the MMU generation number. 
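
The swap loop in cached_root_find_and_keep_current() above rotates the cache exactly as its comment describes, giving each previous root a turn in the "current" slot. A small userspace model of that rotation (plain C, illustrative types, not KVM code):

#include <stdbool.h>
#include <stdio.h>

#define NUM_PREV_ROOTS	3

struct root { unsigned long pgd; };

static void swap_root(struct root *a, struct root *b)
{
	struct root t = *a; *a = *b; *b = t;
}

/* Search current + previous roots for 'pgd'; on a hit the match ends up in
 * *cur and the others keep their most-recently-used order, as in KVM. */
static bool find_and_keep_current(struct root *cur, struct root *prev,
				  unsigned long pgd)
{
	if (cur->pgd == pgd)
		return true;

	for (int i = 0; i < NUM_PREV_ROOTS; i++) {
		/* C 0 1 2 -> 0 C 1 2 -> 1 C 0 2 -> 2 C 0 1, per the comment. */
		swap_root(cur, &prev[i]);
		if (cur->pgd == pgd)
			return true;
	}
	return false;	/* caller would evict and allocate a fresh root */
}

int main(void)
{
	struct root cur = { 100 };
	struct root prev[NUM_PREV_ROOTS] = { { 1 }, { 2 }, { 3 } };

	printf("hit=%d cur=%lu prev=%lu,%lu,%lu\n",	/* hit=1 cur=2 prev=100,1,3 */
	       find_and_keep_current(&cur, prev, 2),
	       cur.pgd, prev[0].pgd, prev[1].pgd, prev[2].pgd);
	return 0;
}
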
However, changing the * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, * which will free the root set here and allocate a new one. */ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); if (force_flush_and_sync_on_reuse) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } /* * The last MMIO access's GVA and GPA are cached in the VCPU. When * switching to a new CR3, that GVA->GPA mapping may no longer be * valid. So clear any cached MMIO info even when we don't need to sync * the shadow page tables. */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); /* * If this is a direct root page, it doesn't have a write flooding * count. Otherwise, clear the write flooding count. */ if (!new_role.direct) { struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); if (!WARN_ON_ONCE(!sp)) __clear_sp_write_flooding_count(sp); } } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) { if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { mmu_spte_clear_no_track(sptep); return true; } mark_mmio_spte(vcpu, sptep, gfn, access); return true; } return false; } #define PTTYPE_EPT 18 /* arbitrary */ #define PTTYPE PTTYPE_EPT #include "paging_tmpl.h" #undef PTTYPE #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE #define PTTYPE 32 #include "paging_tmpl.h" #undef PTTYPE static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; u64 high_bits_rsvd; rsvd_check->bad_mt_xwr = 0; if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); if (level == PT32E_ROOT_LEVEL) high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); else high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); /* Note, NX doesn't exist in PDPTEs, this is handled below. */ if (!nx) high_bits_rsvd |= rsvd_bits(63, 63); /* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only. 
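* (For reference, rsvd_bits(8, 8) below evaluates to the single-bit mask BIT_ULL(8).)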
*/ if (amd) nonleaf_bit8_rsvd = rsvd_bits(8, 8); switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ rsvd_check->rsvd_bits_mask[0][1] = 0; rsvd_check->rsvd_bits_mask[0][0] = 0; rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; if (!pse) { rsvd_check->rsvd_bits_mask[1][1] = 0; break; } if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | high_bits_rsvd | rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_5LEVEL: rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; fallthrough; case PT64_ROOT_4LEVEL: rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | gbpages_bit_rsvd; rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | gbpages_bit_rsvd | rsvd_bits(13, 29); rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; } } static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->cpu_role.base.level, is_efer_nx(context), guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), guest_cpuid_is_amd_compatible(vcpu)); } static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, u64 pa_bits_rsvd, bool execonly, int huge_page_level) { u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); u64 large_1g_rsvd = 0, large_2m_rsvd = 0; u64 bad_mt_xwr; if (huge_page_level < PG_LEVEL_1G) large_1g_rsvd = rsvd_bits(7, 7); if (huge_page_level < PG_LEVEL_2M) large_2m_rsvd = rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 
must not be 110 */ if (!execonly) { /* bits 0..2 must not be 100 unless VMX capabilities allow it */ bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } rsvd_check->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly, int huge_page_level) { __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, execonly, huge_page_level); } static inline u64 reserved_hpa_bits(void) { return rsvd_bits(kvm_host.maxphyaddr, 63); } /* * the page table on host is the shadow page table for the page * table in guest or amd nested guest, its mmu features completely * follow the features in guest. */ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ bool is_amd = true; /* KVM doesn't use 2-level page tables for the shadow MMU. */ bool is_pse = false; struct rsvd_bits_validate *shadow_zero_check; int i; WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL); shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_pse, is_amd); if (!shadow_me_mask) return; for (i = context->root_role.level; --i >= 0;) { /* * So far shadow_me_value is a constant during KVM's life * time. Bits in shadow_me_value are allowed to be set. * Bits in shadow_me_mask but not in shadow_me_value are * not allowed to be set. */ shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask; shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask; shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value; shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value; } } static inline bool boot_cpu_is_amd(void) { WARN_ON_ONCE(!tdp_enabled); return shadow_x_mask == 0; } /* * the direct page table on host, use as much mmu features as * possible, however, kvm currently does not do execution-protection. */ static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) { struct rsvd_bits_validate *shadow_zero_check; int i; shadow_zero_check = &context->shadow_zero_check; if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, true, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, reserved_hpa_bits(), false, max_huge_page_level); if (!shadow_me_mask) return; for (i = context->root_role.level; --i >= 0;) { shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; } } /* * as the comments in reset_shadow_zero_bits_mask() except it * is the shadow page table for intel nested guest. */ static void reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, reserved_hpa_bits(), execonly, max_huge_page_level); } #define BYTE_MASK(access) \ ((1 & (access) ? 2 : 0) | \ (2 & (access) ? 4 : 0) | \ (3 & (access) ? 8 : 0) | \ (4 & (access) ? 16 : 0) | \ (5 & (access) ? 32 : 0) | \ (6 & (access) ? 64 : 0) | \ (7 & (access) ? 
128 : 0)) static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) { unsigned byte; const u8 x = BYTE_MASK(ACC_EXEC_MASK); const u8 w = BYTE_MASK(ACC_WRITE_MASK); const u8 u = BYTE_MASK(ACC_USER_MASK); bool cr4_smep = is_cr4_smep(mmu); bool cr4_smap = is_cr4_smap(mmu); bool cr0_wp = is_cr0_wp(mmu); bool efer_nx = is_efer_nx(mmu); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { unsigned pfec = byte << 1; /* * Each "*f" variable has a 1 bit for each UWX value * that causes a fault with the given PFEC. */ /* Faults from writes to non-writable pages */ u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; /* Faults from user mode accesses to supervisor pages */ u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; /* Faults from fetches of non-executable pages*/ u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0; /* Faults from kernel mode fetches of user pages */ u8 smepf = 0; /* Faults from kernel mode accesses of user pages */ u8 smapf = 0; if (!ept) { /* Faults from kernel mode accesses to user pages */ u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; /* Not really needed: !nx will cause pte.nx to fault */ if (!efer_nx) ff = 0; /* Allow supervisor writes if !cr0.wp */ if (!cr0_wp) wf = (pfec & PFERR_USER_MASK) ? wf : 0; /* Disallow supervisor fetches of user code if cr4.smep */ if (cr4_smep) smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0; /* * SMAP:kernel-mode data accesses from user-mode * mappings should fault. A fault is considered * as a SMAP violation if all of the following * conditions are true: * - X86_CR4_SMAP is set in CR4 * - A user page is accessed * - The access is not a fetch * - The access is supervisor mode * - If implicit supervisor access or X86_EFLAGS_AC is clear * * Here, we cover the first four conditions. * The fifth is computed dynamically in permission_fault(); * PFERR_RSVD_MASK bit will be set in PFEC if the access is * *not* subject to SMAP restrictions. */ if (cr4_smap) smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf; } mmu->permissions[byte] = ff | uf | wf | smepf | smapf; } } /* * PKU is an additional mechanism by which the paging controls access to * user-mode addresses based on the value in the PKRU register. Protection * key violations are reported through a bit in the page fault error code. * Unlike other bits of the error code, the PK bit is not known at the * call site of e.g. gva_to_gpa; it must be computed directly in * permission_fault based on two bits of PKRU, on some machine state (CR4, * CR0, EFER, CPL), and on other bits of the error code and the page tables. * * In particular the following conditions come from the error code, the * page tables and the machine state: * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) * - PK is always zero if U=0 in the page tables * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. * * The PKRU bitmask caches the result of these four conditions. The error * code (minus the P bit) and the page table's U bit form an index into the * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed * with the two bits of the PKRU register corresponding to the protection key. * For the first three conditions above the bits will be 00, thus masking * away both AD and WD. For all reads or if the last condition holds, WD * only will be masked away. 
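* A worked example (assumed values, for illustration only): for a supervisor-mode write (PFEC.W=1, PFEC.U=0) to a user page with CR0.WP=1, the code below computes check_pkey=1 and check_write=1, so the two cached bits are 11b and either PKRU.AD or PKRU.WD of the page's protection key can deny the access.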
*/ static void update_pkru_bitmask(struct kvm_mmu *mmu) { unsigned bit; bool wp; mmu->pkru_mask = 0; if (!is_cr4_pke(mmu)) return; wp = is_cr0_wp(mmu); for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { unsigned pfec, pkey_bits; bool check_pkey, check_write, ff, uf, wf, pte_user; pfec = bit << 1; ff = pfec & PFERR_FETCH_MASK; uf = pfec & PFERR_USER_MASK; wf = pfec & PFERR_WRITE_MASK; /* PFEC.RSVD is replaced by ACC_USER_MASK. */ pte_user = pfec & PFERR_RSVD_MASK; /* * Only need to check the access which is not an * instruction fetch and is to a user page. */ check_pkey = (!ff && pte_user); /* * write access is controlled by PKRU if it is a * user access or CR0.WP = 1. */ check_write = check_pkey && wf && (uf || wp); /* PKRU.AD stops both read and write access. */ pkey_bits = !!check_pkey; /* PKRU.WD stops write access. */ pkey_bits |= (!!check_write) << 1; mmu->pkru_mask |= (pkey_bits & 3) << pfec; } } static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { if (!is_cr0_pg(mmu)) return; reset_guest_rsvds_bits_mask(vcpu, mmu); update_permission_bitmask(mmu, false); update_pkru_bitmask(mmu); } static void paging64_init_context(struct kvm_mmu *context) { context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_spte = paging64_sync_spte; } static void paging32_init_context(struct kvm_mmu *context) { context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; context->sync_spte = paging32_sync_spte; } static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs) { union kvm_cpu_role role = {0}; role.base.access = ACC_ALL; role.base.smm = is_smm(vcpu); role.base.guest_mode = is_guest_mode(vcpu); role.ext.valid = 1; if (!____is_cr0_pg(regs)) { role.base.direct = 1; return role; } role.base.efer_nx = ____is_efer_nx(regs); role.base.cr0_wp = ____is_cr0_wp(regs); role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs); role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs); role.base.has_4_byte_gpte = !____is_cr4_pae(regs); if (____is_efer_lma(regs)) role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; else if (____is_cr4_pae(regs)) role.base.level = PT32E_ROOT_LEVEL; else role.base.level = PT32_ROOT_LEVEL; role.ext.cr4_smep = ____is_cr4_smep(regs); role.ext.cr4_smap = ____is_cr4_smap(regs); role.ext.cr4_pse = ____is_cr4_pse(regs); /* PKEY and LA57 are active iff long mode is active. */ role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); role.ext.efer_lma = ____is_efer_lma(regs); return role; } void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP); BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP); BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS)); if (is_cr0_wp(mmu) == cr0_wp) return; mmu->cpu_role.base.cr0_wp = cr0_wp; reset_guest_paging_metadata(vcpu, mmu); } static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { /* tdp_root_level is architecture forced level, use it if nonzero */ if (tdp_root_level) return tdp_root_level; /* Use 5-level TDP if and only if it's useful/necessary. */ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; return max_tdp_level; } u8 kvm_mmu_get_max_tdp_level(void) { return tdp_root_level ? 
tdp_root_level : max_tdp_level; } static union kvm_mmu_page_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { union kvm_mmu_page_role role = {0}; role.access = ACC_ALL; role.cr0_wp = true; role.efer_nx = true; role.smm = cpu_role.base.smm; role.guest_mode = cpu_role.base.guest_mode; role.ad_disabled = !kvm_ad_enabled; role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; return role; } static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role); if (cpu_role.as_u64 == context->cpu_role.as_u64 && root_role.word == context->root_role.word) return; context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; context->page_fault = kvm_tdp_page_fault; context->sync_spte = NULL; context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; if (!is_cr0_pg(context)) context->gva_to_gpa = nonpaging_gva_to_gpa; else if (is_cr4_pae(context)) context->gva_to_gpa = paging64_gva_to_gpa; else context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, context); reset_tdp_shadow_zero_bits_mask(context); } static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, union kvm_cpu_role cpu_role, union kvm_mmu_page_role root_role) { if (cpu_role.as_u64 == context->cpu_role.as_u64 && root_role.word == context->root_role.word) return; context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; if (!is_cr0_pg(context)) nonpaging_init_context(context); else if (is_cr4_pae(context)) paging64_init_context(context); else paging32_init_context(context); reset_guest_paging_metadata(vcpu, context); reset_shadow_zero_bits_mask(vcpu, context); } static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_page_role root_role; root_role = cpu_role.base; /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */ root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL); /* * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role. * KVM uses NX when TDP is disabled to handle a variety of scenarios, * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. * The iTLB multi-hit workaround can be toggled at any time, so assume * NX can be used by any non-nested shadow MMU to avoid having to reset * MMU contexts. */ root_role.efer_nx = true; shadow_mmu_init_context(vcpu, context, cpu_role, root_role); } void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, unsigned long cr4, u64 efer, gpa_t nested_cr3) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; struct kvm_mmu_role_regs regs = { .cr0 = cr0, .cr4 = cr4 & ~X86_CR4_PKE, .efer = efer, }; union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs); union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. 
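* Put differently, kvm_calc_cpu_role() sets base.direct only when the caller-provided CR0.PG is clear, which is what the WARN below asserts never happens here.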
*/ WARN_ON_ONCE(cpu_role.base.direct); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); if (root_role.level == PT64_ROOT_5LEVEL && cpu_role.base.level == PT64_ROOT_4LEVEL) root_role.passthrough = 1; shadow_mmu_init_context(vcpu, context, cpu_role, root_role); kvm_mmu_new_pgd(vcpu, nested_cr3); } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); static union kvm_cpu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, bool execonly, u8 level) { union kvm_cpu_role role = {0}; /* * KVM does not support SMM transfer monitors, and consequently does not * support the "entry to SMM" control either. role.base.smm is always 0. */ WARN_ON_ONCE(is_smm(vcpu)); role.base.level = level; role.base.has_4_byte_gpte = false; role.base.direct = false; role.base.ad_disabled = !accessed_dirty; role.base.guest_mode = true; role.base.access = ACC_ALL; role.ext.word = 0; role.ext.execonly = execonly; role.ext.valid = 1; return role; } void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, int huge_page_level, bool accessed_dirty, gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); union kvm_cpu_role new_mode = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); if (new_mode.as_u64 != context->cpu_role.as_u64) { /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ context->cpu_role.as_u64 = new_mode.as_u64; context->root_role.word = new_mode.base.word; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_spte = ept_sync_spte; update_permission_bitmask(context, true); context->pkru_mask = 0; reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); reset_ept_shadow_zero_bits_mask(context, execonly); } kvm_mmu_new_pgd(vcpu, new_eptp); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; kvm_init_shadow_mmu(vcpu, cpu_role); context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role new_mode) { struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; if (new_mode.as_u64 == g_context->cpu_role.as_u64) return; g_context->cpu_role.as_u64 = new_mode.as_u64; g_context->get_guest_pgd = get_guest_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; /* * L2 page tables are never shadowed, so there is no need to sync * SPTEs. */ g_context->sync_spte = NULL; /* * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using * L1's nested page tables (e.g. EPT12). The nested translation * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using * L2's page tables as the first level of translation and L1's * nested page tables as the second level of translation. Basically * the gva_to_gpa functions between mmu and nested_mmu are swapped. 
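* Concretely, for nested EPT: mmu->gva_to_gpa (ept_gva_to_gpa for guest_mmu) walks EPT12 to translate an L2 GPA into an L1 GPA, while nested_mmu.gva_to_gpa walks the L2 guest's own IA-32 paging structures, with each guest table access itself translated through L1's nested page tables, to translate an L2 GVA into an L1 GPA.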
*/ if (!is_paging(vcpu)) g_context->gva_to_gpa = nonpaging_gva_to_gpa; else if (is_long_mode(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa; else if (is_pae(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa; else g_context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, g_context); } void kvm_init_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs); if (mmu_is_nested(vcpu)) init_kvm_nested_mmu(vcpu, cpu_role); else if (tdp_enabled) init_kvm_tdp_mmu(vcpu, cpu_role); else init_kvm_softmmu(vcpu, cpu_role); } EXPORT_SYMBOL_GPL(kvm_init_mmu); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) { /* * Invalidate all MMU roles to force them to reinitialize as CPUID * information is factored into reserved bit calculations. * * Correctly handling multiple vCPU models with respect to paging and * physical address properties) in a single VM would require tracking * all relevant CPUID information in kvm_mmu_page_role. That is very * undesirable as it would increase the memory requirements for * gfn_write_track (see struct kvm_mmu_page_role comments). For now * that problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ vcpu->arch.root_mmu.root_role.invalid = 1; vcpu->arch.guest_mmu.root_role.invalid = 1; vcpu->arch.nested_mmu.root_role.invalid = 1; vcpu->arch.root_mmu.cpu_role.ext.valid = 0; vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; kvm_mmu_reset_context(vcpu); /* * Changing guest CPUID after KVM_RUN is forbidden, see the comment in * kvm_arch_vcpu_ioctl(). */ KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); kvm_init_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct); if (r) goto out; r = mmu_alloc_special_roots(vcpu); if (r) goto out; if (vcpu->arch.mmu->root_role.direct) r = mmu_alloc_direct_roots(vcpu); else r = mmu_alloc_shadow_roots(vcpu); if (r) goto out; kvm_mmu_sync_roots(vcpu); kvm_mmu_load_pgd(vcpu); /* * Flush any TLB entries for the new root, the provenance of the root * is unknown. Even if KVM ensures there are no stale TLB entries * for a freed root, in theory another hypervisor could have left * stale entries. Flushing on alloc also allows KVM to skip the TLB * flush when freeing a root (see kvm_tdp_mmu_put_root()). */ kvm_x86_call(flush_tlb_current)(vcpu); out: return r; } void kvm_mmu_unload(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa)); kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa)); vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); } static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa) { struct kvm_mmu_page *sp; if (!VALID_PAGE(root_hpa)) return false; /* * When freeing obsolete roots, treat roots as obsolete if they don't * have an associated shadow page, as it's impossible to determine if * such roots are fresh or stale. 
This does mean KVM will get false * positives and free roots that don't strictly need to be freed, but * such false positives are relatively rare: * * (a) only PAE paging and nested NPT have roots without shadow pages * (or any shadow paging flavor with a dummy root, see note below) * (b) remote reloads due to a memslot update obsoletes _all_ roots * (c) KVM doesn't track previous roots for PAE paging, and the guest * is unlikely to zap an in-use PGD. * * Note! Dummy roots are unique in that they are obsoleted by memslot * _creation_! See also FNAME(fetch). */ sp = root_to_sp(root_hpa); return !sp || is_obsolete_sp(kvm, sp); } static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; int i; if (is_obsolete_root(kvm, mmu->root.hpa)) roots_to_free |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa)) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } if (roots_to_free) kvm_mmu_free_roots(kvm, mmu, roots_to_free); } void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) { __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) { u64 gentry = 0; int r; /* * Assume that the pte write on a page table of the same type * as the current vcpu paging mode since we update the sptes only * when they have the same mode. */ if (is_pae(vcpu) && *bytes == 4) { /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; } if (*bytes == 4 || *bytes == 8) { r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes); if (r) gentry = 0; } return gentry; } /* * If we're seeing too many writes to a page, it may no longer be a page table, * or we may be forking, in which case it is better to unmap the page. */ static bool detect_write_flooding(struct kvm_mmu_page *sp) { /* * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected. */ if (sp->role.level == PG_LEVEL_4K) return false; atomic_inc(&sp->write_flooding_count); return atomic_read(&sp->write_flooding_count) >= 3; } /* * Misaligned accesses are too much trouble to fix up; also, they usually * indicate a page is not used as a page table. */ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, int bytes) { unsigned offset, pte_size, misaligned; offset = offset_in_page(gpa); pte_size = sp->role.has_4_byte_gpte ? 4 : 8; /* * Sometimes, the OS only writes the last one bytes to update status * bits, for example, in linux, andb instruction is used in clear_bit(). */ if (!(offset & (pte_size - 1)) && bytes == 1) return false; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; return misaligned; } static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) { unsigned page_offset, quadrant; u64 *spte; int level; page_offset = offset_in_page(gpa); level = sp->role.level; *nspte = 1; if (sp->role.has_4_byte_gpte) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map * only 2MB. So we need to double the offset again * and zap two pdes instead of one. 
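* Worked example (hypothetical offset): a guest write at byte offset 0x10 of a PSE page directory (guest PDE index 4) becomes shadow byte offset 0x40 after the two doublings below, i.e. shadow PDE indices 8 and 9, the pair of 2MB shadow PDEs backing that single 4MB guest PDE.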
*/ if (level == PT32_ROOT_LEVEL) { page_offset &= ~7; /* kill rounding error */ page_offset <<= 1; *nspte = 2; } quadrant = page_offset >> PAGE_SHIFT; page_offset &= ~PAGE_MASK; if (quadrant != sp->role.quadrant) return NULL; } spte = &sp->spt[page_offset / sizeof(*spte)]; return spte; } void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; bool flush = false; /* * When emulating guest writes, ensure the written value is visible to * any task that is handling page faults before checking whether or not * KVM is shadowing a guest PTE. This ensures either KVM will create * the correct SPTE in the page fault handler, or this task will see * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in * account_shadowed(). */ smp_mb(); if (!vcpu->kvm->arch.indirect_shadow_pages) return; write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); ++vcpu->kvm->stat.mmu_pte_write; for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } spte = get_written_sptes(sp, gpa, &npte); if (!spte) continue; while (npte--) { entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; if (is_shadow_present_pte(entry)) flush = true; ++spte; } } kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); write_unlock(&vcpu->kvm->mmu_lock); } static bool is_write_to_guest_page_table(u64 error_code) { const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK; return (error_code & mask) == mask; } static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, int *emulation_type) { bool direct = vcpu->arch.mmu->root_role.direct; /* * Do not try to unprotect and retry if the vCPU re-faulted on the same * RIP with the same address that was previously unprotected, as doing * so will likely put the vCPU into an infinite loop. E.g. if the vCPU uses * a non-page-table modifying instruction on the PDE that points to the * instruction, then unprotecting the gfn will unmap the instruction's * code, i.e. make it impossible for the instruction to ever complete. */ if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) && vcpu->arch.last_retry_addr == cr2_or_gpa) return RET_PF_EMULATE; /* * Reset the unprotect+retry values that guard against infinite loops. * The values will be refreshed if KVM explicitly unprotects a gfn and * retries; in all other cases it's safe to retry in the future even if * the next page fault happens on the same RIP+address. */ vcpu->arch.last_retry_eip = 0; vcpu->arch.last_retry_addr = 0; /* * It should be impossible to reach this point with an MMIO cache hit, * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid, * writable memslot, and creating a memslot should invalidate the MMIO * cache by way of changing the memslot generation. WARN and disallow * retry if MMIO is detected, as retrying MMIO emulation is pointless * and could put the vCPU into an infinite loop because the processor * will keep faulting on the non-existent MMIO address.
*/ if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct))) return RET_PF_EMULATE; /* * Before emulating the instruction, check to see if the access was due * to a read-only violation while the CPU was walking non-nested NPT * page tables, i.e. for a direct MMU, for _guest_ page tables in L1. * If L1 is sharing (a subset of) its page tables with L2, e.g. by * having nCR3 share lower level page tables with hCR3, then when KVM * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also * unknowingly write-protecting L1's guest page tables, which KVM isn't * shadowing. * * Because the CPU (by default) walks NPT page tables using a write * access (to ensure the CPU can do A/D updates), page walks in L1 can * trigger write faults for the above case even when L1 isn't modifying * PTEs. As a result, KVM will unnecessarily emulate (or at least, try * to emulate) an excessive number of L1 instructions; because L1's MMU * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs * and thus no need to emulate in order to guarantee forward progress. * * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can * proceed without triggering emulation. If one or more shadow pages * was zapped, skip emulation and resume L1 to let it natively execute * the instruction. If no shadow pages were zapped, then the write- * fault is due to something else entirely, i.e. KVM needs to emulate, * as resuming the guest will put it into an infinite loop. * * Note, this code also applies to Intel CPUs, even though it is *very* * unlikely that an L1 will share its page tables (IA32/PAE/paging64 * format) with L2's page tables (EPT format). * * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to * unprotect the gfn and retry if an event is awaiting reinjection. If * KVM emulates multiple instructions before completing event injection, * the event could be delayed beyond what is architecturally allowed, * e.g. KVM could inject an IRQ after the TPR has been raised. */ if (((direct && is_write_to_guest_page_table(error_code)) || (!direct && kvm_event_needs_reinjection(vcpu))) && kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return RET_PF_RETRY; /* * The gfn is write-protected, but if KVM detects it's emulating an * instruction that is unlikely to be used to modify page tables, or if * emulation fails, KVM can try to unprotect the gfn and let the CPU * re-execute the instruction that caused the page fault. Do not allow * retrying an instruction from a nested guest as KVM is only explicitly * shadowing L1's page tables, i.e. unprotecting something for L1 isn't * going to magically fix whatever issue caused L2 to fail. */ if (!is_guest_mode(vcpu)) *emulation_type |= EMULTYPE_ALLOW_RETRY_PF; return RET_PF_EMULATE; } int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; /* * Except for reserved faults (emulated MMIO is shared-only), set the * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's * current attributes, which are the source of truth for such VMs. Note, * this is wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't * currently support nested virtualization (among many other things) * for software-protected VMs.
*/ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && !(error_code & PFERR_RSVD_MASK) && vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) error_code |= PFERR_PRIVATE_ACCESS; r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS)) return -EFAULT; r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { vcpu->stat.pf_taken++; r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, &emulation_type, NULL); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } if (r < 0) return r; if (r == RET_PF_WRITE_PROTECTED) r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code, &emulation_type); if (r == RET_PF_FIXED) vcpu->stat.pf_fixed++; else if (r == RET_PF_EMULATE) vcpu->stat.pf_emulate++; else if (r == RET_PF_SPURIOUS) vcpu->stat.pf_spurious++; if (r != RET_PF_EMULATE) return 1; emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) { u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; int root_level, leaf, level; leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level); if (unlikely(leaf < 0)) return; pr_err("%s %llx", msg, gpa); for (level = root_level; level >= leaf; level--) pr_cont(", spte[%d] = 0x%llx", level, sptes[level]); pr_cont("\n"); } EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes); static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; vcpu_clear_mmio_info(vcpu, addr); /* * Walking and synchronizing SPTEs both assume they are operating in * the context of the current MMU, and would need to be reworked if * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT. */ if (WARN_ON_ONCE(mmu != vcpu->arch.mmu)) return; if (!VALID_PAGE(root_hpa)) return; write_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) { struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep); if (sp->unsync) { int ret = kvm_sync_spte(vcpu, sp, iterator.index); if (ret < 0) mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL); if (ret) kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep); } if (!sp->unsync_children) break; } write_unlock(&vcpu->kvm->mmu_lock); } void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, unsigned long roots) { int i; WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL); /* It's actually a GPA for vcpu->arch.guest_mmu. */ if (mmu != &vcpu->arch.guest_mmu) { /* INVLPG on a non-canonical address is a NOP according to the SDM. */ if (is_noncanonical_invlpg_address(addr, vcpu)) return; kvm_x86_call(flush_tlb_gva)(vcpu, addr); } if (!mmu->sync_spte) return; if (roots & KVM_MMU_ROOT_CURRENT) __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (roots & KVM_MMU_ROOT_PREVIOUS(i)) __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa); } } EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { /* * INVLPG is required to invalidate any global mappings for the VA, * irrespective of PCID. Blindly sync all roots as it would take * roughly the same amount of work/time to determine whether any of the * previous roots have a global mapping. 
* * Mappings not reachable via the current or previous cached roots will * be synced when switching to that new cr3, so nothing needs to be * done here for them. */ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; unsigned long roots = 0; uint i; if (pcid == kvm_get_active_pcid(vcpu)) roots |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) roots |= KVM_MMU_ROOT_PREVIOUS(i); } if (roots) kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots); ++vcpu->stat.invlpg; /* * Mappings not reachable via the current cr3 or the prev_roots will be * synced when switching to that cr3, so nothing needs to be done here * for them. */ } void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level) { tdp_enabled = enable_tdp; tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; #ifdef CONFIG_X86_64 tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled; #endif /* * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when * the kernel is not. But, KVM never creates a page size greater than * what is used by the kernel for any given HVA, i.e. the kernel's * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). */ if (tdp_enabled) max_huge_page_level = tdp_huge_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) max_huge_page_level = PG_LEVEL_1G; else max_huge_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); static void free_mmu_pages(struct kvm_mmu *mmu) { if (!tdp_enabled && mmu->pae_root) set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->pml4_root); free_page((unsigned long)mmu->pml5_root); } static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { struct page *page; int i; mmu->root.hpa = INVALID_PAGE; mmu->root.pgd = 0; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu) return 0; /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU * creation. When emulating 32-bit mode, cr3 is only 32 bits even on * x86_64. Therefore we need to allocate the PDP table in the first * 4GB of memory, which happens to fit the DMA32 zone. TDP paging * generally doesn't use PAE paging and can skip allocating the PDP * table. The main exception, handled here, is SVM's 32-bit NPT. The * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit * KVM; that horror is handled on-demand by mmu_alloc_special_roots(). */ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); if (!page) return -ENOMEM; mmu->pae_root = page_address(page); /* * CR3 is only 32 bits when PAE paging is used, thus it's impossible to * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so * that KVM's writes and the CPU's reads get along. 
Note, this is * only necessary when using shadow paging, as 64-bit NPT can get at * the C-bit even when shadowing 32-bit NPT, and SME isn't supported * by 32-bit kernels (when KVM itself uses 32-bit NPT). */ if (!tdp_enabled) set_memory_decrypted((unsigned long)mmu->pae_root, 1); else WARN_ON_ONCE(shadow_me_value); for (i = 0; i < 4; ++i) mmu->pae_root[i] = INVALID_PAE_ROOT; return 0; } int kvm_mmu_create(struct kvm_vcpu *vcpu) { int ret; vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu_shadow_page_cache.init_value = SHADOW_NONPRESENT_VALUE; if (!vcpu->arch.mmu_shadow_page_cache.init_value) vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); if (ret) goto fail_allocate_root; return ret; fail_allocate_root: free_mmu_pages(&vcpu->arch.guest_mmu); return ret; } #define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; LIST_HEAD(invalid_list); bool unstable; lockdep_assert_held(&kvm->slots_lock); restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { /* * No obsolete valid page exists before a newly created page * since active_mmu_pages is a FIFO list. */ if (!is_obsolete_sp(kvm, sp)) break; /* * Invalid pages should never land back on the list of active * pages. Skip the bogus page, otherwise we'll get stuck in an * infinite loop if the page gets put back on the list (again). */ if (WARN_ON_ONCE(sp->role.invalid)) continue; /* * No need to flush the TLB since we're only zapping shadow * pages with an obsolete generation number and all vCPUS have * loaded a new root, i.e. the shadow pages being zapped cannot * be in active use by the guest. */ if (batch >= BATCH_ZAP_PAGES && cond_resched_rwlock_write(&kvm->mmu_lock)) { batch = 0; goto restart; } unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &nr_zapped); batch += nr_zapped; if (unstable) goto restart; } /* * Kick all vCPUs (via remote TLB flush) before freeing the page tables * to ensure KVM is not in the middle of a lockless shadow page table * walk, which may reference the pages. The remote TLB flush itself is * not required and is simply a convenient way to kick vCPUs as needed. * KVM performs a local TLB flush when allocating a new root (see * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are * running with an obsolete MMU. */ kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* * Fast invalidate all shadow pages and use lock-break technique * to zap obsolete pages. * * It's required when memslot is being deleted or VM is being * destroyed, in these cases, we should ensure that KVM MMU does * not use any resource of the being-deleted slot or all slots * after calling the function. */ static void kvm_mmu_zap_all_fast(struct kvm *kvm) { lockdep_assert_held(&kvm->slots_lock); write_lock(&kvm->mmu_lock); trace_kvm_mmu_zap_all_fast(kvm); /* * Toggle mmu_valid_gen between '0' and '1'. 
Because slots_lock is * held for the entire duration of zapping obsolete pages, it's * impossible for there to be multiple invalid generations associated * with *valid* shadow pages at any given time, i.e. there is exactly * one valid generation and (at most) one invalid generation. */ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; /* * In order to ensure all vCPUs drop their soon-to-be invalid roots, * invalidating TDP MMU roots must be done while holding mmu_lock for * write and in the same critical section as making the reload request, * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield. */ if (tdp_mmu_enabled) kvm_tdp_mmu_invalidate_all_roots(kvm); /* * Notify all vcpus to reload their shadow page tables and flush TLB. * All vcpus will then switch to the new shadow page tables with the new * mmu_valid_gen. * * Note: this must be done under the protection of mmu_lock; * otherwise a vcpu could purge a shadow page but miss the TLB flush. */ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS); kvm_zap_obsolete_pages(kvm); write_unlock(&kvm->mmu_lock); /* * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before * returning to the caller, e.g. if the zap is in response to a memslot * deletion, mmu_notifier callbacks will be unable to reach the SPTEs * associated with the deleted memslot once the update completes. * Deferring the zap until the final reference to the root is put would * lead to use-after-free. */ if (tdp_mmu_enabled) kvm_tdp_mmu_zap_invalidated_roots(kvm); } void kvm_mmu_init_vm(struct kvm *kvm) { kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); if (tdp_mmu_enabled) kvm_mmu_init_tdp_mmu(kvm); kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO; kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; } static void mmu_free_vm_memory_caches(struct kvm *kvm) { kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache); kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache); kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache); } void kvm_mmu_uninit_vm(struct kvm *kvm) { if (tdp_mmu_enabled) kvm_mmu_uninit_tdp_mmu(kvm); mmu_free_vm_memory_caches(kvm); } static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; struct kvm_memslot_iter iter; bool flush = false; gfn_t start, end; int i; if (!kvm_memslots_have_rmaps(kvm)) return flush; for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { memslot = iter.slot; start = max(gfn_start, memslot->base_gfn); end = min(gfn_end, memslot->base_gfn + memslot->npages); if (WARN_ON_ONCE(start >= end)) continue; flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start, end, true, flush); } } return flush; } /* * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end * (not including it) */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { bool flush; if (WARN_ON_ONCE(gfn_end <= gfn_start)) return; write_lock(&kvm->mmu_lock); kvm_mmu_invalidate_begin(kvm); kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end); flush =
kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); if (tdp_mmu_enabled) flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush); if (flush) kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); kvm_mmu_invalidate_end(kvm); write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { return rmap_write_protect(rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); write_unlock(&kvm->mmu_lock); } if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); } } static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) { return kvm_mmu_memory_cache_nr_free_objects(cache) < min; } static bool need_topup_split_caches_or_resched(struct kvm *kvm) { if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) return true; /* * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed * to split a single huge page. Calculating how many are actually needed * is possible but not worth the complexity. */ return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) || need_topup(&kvm->arch.split_page_header_cache, 1) || need_topup(&kvm->arch.split_shadow_page_cache, 1); } static int topup_split_caches(struct kvm *kvm) { /* * Allocating rmap list entries when splitting huge pages for nested * MMUs is uncommon as KVM needs to use a list if and only if there is * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be * aliased by multiple L2 gfns and/or from multiple nested roots with * different roles. Aliasing gfns when using TDP is atypical for VMMs; * a few gfns are often aliased during boot, e.g. when remapping BIOS, * but aliasing rarely occurs post-boot or for many gfns. If there is * only one rmap entry, rmap->val points directly at that one entry and * doesn't need to allocate a list. Buffer the cache by the default * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM * encounters an aliased gfn or two. */ const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS + KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE; int r; lockdep_assert_held(&kvm->slots_lock); r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity, SPLIT_DESC_CACHE_MIN_NR_OBJECTS); if (r) return r; r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); if (r) return r; return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1); } static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep) { struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); struct shadow_page_caches caches = {}; union kvm_mmu_page_role role; unsigned int access; gfn_t gfn; gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep)); /* * Note, huge page splitting always uses direct shadow pages, regardless * of whether the huge page itself is mapped by a direct or indirect * shadow page, since the huge page region itself is being directly * mapped with smaller pages. */ role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access); /* Direct SPs do not require a shadowed_info_cache. 
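* A direct SP derives each SPTE's gfn and access from the SP's role and base gfn, so no shadowed_translation array needs to be allocated for it.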
*/ caches.page_header_cache = &kvm->arch.split_page_header_cache; caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache; /* Safe to pass NULL for vCPU since requesting a direct SP. */ return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role); } static void shadow_mmu_split_huge_page(struct kvm *kvm, const struct kvm_memory_slot *slot, u64 *huge_sptep) { struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache; u64 huge_spte = READ_ONCE(*huge_sptep); struct kvm_mmu_page *sp; bool flush = false; u64 *sptep, spte; gfn_t gfn; int index; sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep); for (index = 0; index < SPTE_ENT_PER_PAGE; index++) { sptep = &sp->spt[index]; gfn = kvm_mmu_page_get_gfn(sp, index); /* * The SP may already have populated SPTEs, e.g. if this huge * page is aliased by multiple sptes with the same access * permissions. These entries are guaranteed to map the same * gfn-to-pfn translation since the SP is direct, so no need to * modify them. * * However, if a given SPTE points to a lower level page table, * that lower level page table may only be partially populated. * Installing such SPTEs would effectively unmap a portion of the * huge page. Unmapping guest memory always requires a TLB flush * since a subsequent operation on the unmapped regions would * fail to detect the need to flush. */ if (is_shadow_present_pte(*sptep)) { flush |= !is_last_spte(*sptep, sp->role.level); continue; } spte = make_small_spte(kvm, huge_spte, sp->role, index); mmu_spte_set(sptep, spte); __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); } __link_shadow_page(kvm, cache, huge_sptep, sp, flush); } static int shadow_mmu_try_split_huge_page(struct kvm *kvm, const struct kvm_memory_slot *slot, u64 *huge_sptep) { struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); int level, r = 0; gfn_t gfn; u64 spte; /* Grab information for the tracepoint before dropping the MMU lock. */ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); level = huge_sp->role.level; spte = *huge_sptep; if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) { r = -ENOSPC; goto out; } if (need_topup_split_caches_or_resched(kvm)) { write_unlock(&kvm->mmu_lock); cond_resched(); /* * If the topup succeeds, return -EAGAIN to indicate that the * rmap iterator should be restarted because the MMU lock was * dropped. */ r = topup_split_caches(kvm) ?: -EAGAIN; write_lock(&kvm->mmu_lock); goto out; } shadow_mmu_split_huge_page(kvm, slot, huge_sptep); out: trace_kvm_mmu_split_huge_page(gfn, spte, level, r); return r; } static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { struct rmap_iterator iter; struct kvm_mmu_page *sp; u64 *huge_sptep; int r; restart: for_each_rmap_spte(rmap_head, &iter, huge_sptep) { sp = sptep_to_sp(huge_sptep); /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ if (WARN_ON_ONCE(!sp->role.guest_mode)) continue; /* The rmaps should never contain non-leaf SPTEs. */ if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) continue; /* SPs with level >PG_LEVEL_4K should never be unsync. */ if (WARN_ON_ONCE(sp->unsync)) continue; /* Don't bother splitting huge pages on invalid SPs. */ if (sp->role.invalid) continue; r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep); /* * The split succeeded or needs to be retried because the MMU * lock was dropped. Either way, restart the iterator to get it * back into a consistent state.
*/ if (!r || r == -EAGAIN) goto restart; /* The split failed and shouldn't be retried (e.g. -ENOMEM). */ break; } return false; } static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t start, gfn_t end, int target_level) { int level; /* * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working * down to the target level. This ensures pages are recursively split * all the way to the target level. There's no need to split pages * already at the target level. */ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, level, level, start, end - 1, true, true, false); } /* Must be called with the mmu_lock held in write-mode. */ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level) { if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); /* * A TLB flush is unnecessary at this point for the same reasons as in * kvm_mmu_slot_try_split_huge_pages(). */ } void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, int target_level) { u64 start = memslot->base_gfn; u64 end = start + memslot->npages; if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); write_unlock(&kvm->mmu_lock); } read_lock(&kvm->mmu_lock); kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); read_unlock(&kvm->mmu_lock); /* * No TLB flush is necessary here. KVM will flush TLBs after * write-protecting and/or clearing dirty on the newly split SPTEs to * ensure that guest writes are reflected in the dirty log before the * ioctl to enable dirty logging on this memslot completes. Since the * split SPTEs retain the write and dirty bits of the huge SPTE, it is * safe for KVM to decide if a TLB flush is necessary based on the split * SPTEs. */ } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; struct kvm_mmu_page *sp; restart: for_each_rmap_spte(rmap_head, &iter, sptep) { sp = sptep_to_sp(sptep); /* * We cannot do huge page mapping for indirect shadow pages, * which are found on the last rmap (level = 1) when not using * tdp; such shadow pages are synced with the page table in * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) kvm_flush_remote_tlbs_sptep(kvm, sptep); else need_tlb_flush = 1; goto restart; } } return need_tlb_flush; } EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { /* * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap * pages that are already mapped at the maximum hugepage level. 
*/ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) kvm_flush_remote_tlbs_memslot(kvm, slot); } void kvm_mmu_recover_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); kvm_rmap_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_recover_huge_pages(kvm, slot); read_unlock(&kvm->mmu_lock); } } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); /* * Clear dirty bits only on 4k SPTEs since the legacy MMU only * support dirty logging at a 4k granularity. */ walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); } /* * The caller will flush the TLBs after this function returns. * * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB * out of mmu lock also guarantees no dirty pages will be lost in * dirty_bitmap. */ } static void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); int ign; write_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (WARN_ON_ONCE(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; if (cond_resched_rwlock_write(&kvm->mmu_lock)) goto restart; } kvm_mmu_commit_zap_page(kvm, &invalid_list); if (tdp_mmu_enabled) kvm_tdp_mmu_zap_all(kvm); write_unlock(&kvm->mmu_lock); } void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_mmu_zap_all(kvm); } static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm, struct kvm_memory_slot *slot, bool flush) { LIST_HEAD(invalid_list); unsigned long i; if (list_empty(&kvm->arch.active_mmu_pages)) goto out_flush; /* * Since accounting information is stored in struct kvm_arch_memory_slot, * all MMU pages that are shadowing guest PTEs must be zapped before the * memslot is deleted, as freeing such pages after the memslot is freed * will result in use-after-free, e.g. in unaccount_shadowed(). 
*/ for (i = 0; i < slot->npages; i++) { struct kvm_mmu_page *sp; gfn_t gfn = slot->base_gfn + i; for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); flush = false; cond_resched_rwlock_write(&kvm->mmu_lock); } } out_flush: kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); } static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_gfn_range range = { .slot = slot, .start = slot->base_gfn, .end = slot->base_gfn + slot->npages, .may_block = true, }; bool flush; write_lock(&kvm->mmu_lock); flush = kvm_unmap_gfn_range(kvm, &range); kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush); write_unlock(&kvm->mmu_lock); } static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm) { return kvm->arch.vm_type == KVM_X86_DEFAULT_VM && kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { if (kvm_memslot_flush_zap_all(kvm)) kvm_mmu_zap_all_fast(kvm); else kvm_mmu_zap_memslot(kvm, slot); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); gen &= MMIO_SPTE_GEN_MASK; /* * Generation numbers are incremented in multiples of the number of * address spaces in order to provide unique generations across all * address spaces. Strip what is effectively the address space * modifier prior to checking for a wrap of the MMIO generation so * that a wrap in any address space is detected. */ gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1); /* * The very rare case: if the MMIO generation number has wrapped, * zap all shadow pages. */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_zap_all_fast(kvm); } } static void mmu_destroy_caches(void) { kmem_cache_destroy(pte_list_desc_cache); kmem_cache_destroy(mmu_page_header_cache); } static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) return sysfs_emit(buffer, "never\n"); return param_get_bool(buffer, kp); } static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); } static void __set_nx_huge_pages(bool val) { nx_huge_pages = itlb_multihit_kvm_mitigation = val; } static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) { bool old_val = nx_huge_pages; bool new_val; if (nx_hugepage_mitigation_hard_disabled) return -EPERM; /* In "auto" mode deploy workaround only if CPU has the bug. 
*/ if (sysfs_streq(val, "off")) { new_val = 0; } else if (sysfs_streq(val, "force")) { new_val = 1; } else if (sysfs_streq(val, "auto")) { new_val = get_nx_auto_mode(); } else if (sysfs_streq(val, "never")) { new_val = 0; mutex_lock(&kvm_lock); if (!list_empty(&vm_list)) { mutex_unlock(&kvm_lock); return -EBUSY; } nx_hugepage_mitigation_hard_disabled = true; mutex_unlock(&kvm_lock); } else if (kstrtobool(val, &new_val) < 0) { return -EINVAL; } __set_nx_huge_pages(new_val); if (new_val != old_val) { struct kvm *kvm; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { mutex_lock(&kvm->slots_lock); kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); } mutex_unlock(&kvm_lock); } return 0; } /* * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as * its default value of -1 is technically undefined behavior for a boolean. * Forward the module init call to SPTE code so that it too can handle module * params that need to be resolved/snapshot. */ void __init kvm_mmu_x86_module_init(void) { if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); /* * Snapshot userspace's desire to enable the TDP MMU. Whether or not the * TDP MMU is actually enabled is determined in kvm_configure_mmu() * when the vendor module is loaded. */ tdp_mmu_allowed = tdp_mmu_enabled; kvm_mmu_spte_module_init(); } /* * The bulk of the MMU initialization is deferred until the vendor module is * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need * to be reset when a potentially different vendor module is loaded. */ int kvm_mmu_vendor_module_init(void) { int ret = -ENOMEM; /* * MMU roles use union aliasing which is, generally speaking, an * undefined behavior. However, we supposedly know how compilers behave * and the current status quo is unlikely to change. Guardians below are * supposed to let us know if the assumption becomes false. */ BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32)); BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32)); BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64)); kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT); if (!pte_list_desc_cache) goto out; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof(struct kvm_mmu_page), 0, SLAB_ACCOUNT, NULL); if (!mmu_page_header_cache) goto out; return 0; out: mmu_destroy_caches(); return ret; } void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); free_mmu_pages(&vcpu->arch.root_mmu); free_mmu_pages(&vcpu->arch.guest_mmu); mmu_free_memory_caches(vcpu); } void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); } /* * Calculate the effective recovery period, accounting for '0' meaning "let KVM * select a halving time of 1 hour". Returns true if recovery is enabled. */ static bool calc_nx_huge_pages_recovery_period(uint *period) { /* * Use READ_ONCE to get the params, this may be called outside of the * param setters, e.g. by the kthread to compute its next timeout. */ bool enabled = READ_ONCE(nx_huge_pages); uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); if (!enabled || !ratio) return false; *period = READ_ONCE(nx_huge_pages_recovery_period_ms); if (!*period) { /* Make sure the period is not less than one second. 
*/ ratio = min(ratio, 3600u); *period = 60 * 60 * 1000 / ratio; } return true; } static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) { bool was_recovery_enabled, is_recovery_enabled; uint old_period, new_period; int err; if (nx_hugepage_mitigation_hard_disabled) return -EPERM; was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); if (err) return err; is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period); if (is_recovery_enabled && (!was_recovery_enabled || old_period > new_period)) { struct kvm *kvm; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); mutex_unlock(&kvm_lock); } return err; } static void kvm_recover_nx_huge_pages(struct kvm *kvm) { unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; struct kvm_memory_slot *slot; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; LIST_HEAD(invalid_list); bool flush = false; ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); write_lock(&kvm->mmu_lock); /* * Zapping TDP MMU shadow pages, including the remote TLB flush, must * be done under RCU protection, because the pages are freed via RCU * callback. */ rcu_read_lock(); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { if (list_empty(&kvm->arch.possible_nx_huge_pages)) break; /* * We use a separate list instead of just using active_mmu_pages * because the number of shadow pages that can be replaced with an * NX huge page is expected to be relatively small compared to * the total number of shadow pages. And because the TDP MMU * doesn't use active_mmu_pages. */ sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, struct kvm_mmu_page, possible_nx_huge_page_link); WARN_ON_ONCE(!sp->nx_huge_page_disallowed); WARN_ON_ONCE(!sp->role.direct); /* * Unaccount and do not attempt to recover any NX Huge Pages * that are being dirty tracked, as they would just be faulted * back in as 4KiB pages. The NX Huge Pages in this slot will be * recovered, along with all the other huge pages in the slot, * when dirty logging is disabled. * * Since gfn_to_memslot() is relatively expensive, it helps to * skip it if the test cannot possibly return true. On the * other hand, if any memslot has logging enabled, chances are * good that all of them do, in which case unaccount_nx_huge_page() * is much cheaper than zapping the page. * * If a memslot update is in progress, reading an incorrect value * of kvm->nr_memslots_dirty_logging is not a problem: if it is * becoming zero, gfn_to_memslot() will be done unnecessarily; if * it is becoming nonzero, the page will be zapped unnecessarily. * Either way, this only affects efficiency in racy situations, * and not correctness.
*/ slot = NULL; if (atomic_read(&kvm->nr_memslots_dirty_logging)) { struct kvm_memslots *slots; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); WARN_ON_ONCE(!slot); } if (slot && kvm_slot_dirty_track_enabled(slot)) unaccount_nx_huge_page(kvm, sp); else if (is_tdp_mmu_page(sp)) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); rcu_read_unlock(); cond_resched_rwlock_write(&kvm->mmu_lock); flush = false; rcu_read_lock(); } } kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); rcu_read_unlock(); write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } static void kvm_nx_huge_page_recovery_worker_kill(void *data) { } static bool kvm_nx_huge_page_recovery_worker(void *data) { struct kvm *kvm = data; bool enabled; uint period; long remaining_time; enabled = calc_nx_huge_pages_recovery_period(&period); if (!enabled) return false; remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period) - get_jiffies_64(); if (remaining_time > 0) { schedule_timeout(remaining_time); /* check for signals and come back */ return true; } __set_current_state(TASK_RUNNING); kvm_recover_nx_huge_pages(kvm); kvm->arch.nx_huge_page_last = get_jiffies_64(); return true; } int kvm_mmu_post_init_vm(struct kvm *kvm) { if (nx_hugepage_mitigation_hard_disabled) return 0; kvm->arch.nx_huge_page_last = get_jiffies_64(); kvm->arch.nx_huge_page_recovery_thread = vhost_task_create( kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, kvm, "kvm-nx-lpage-recovery"); if (!kvm->arch.nx_huge_page_recovery_thread) return -ENOMEM; vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); return 0; } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { if (kvm->arch.nx_huge_page_recovery_thread) vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread); } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { /* * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM * can simply ignore such slots. But if userspace is making memory * PRIVATE, then KVM must prevent the guest from accessing the memory * as shared. And if userspace is making memory SHARED and this point * is reached, then at least one page within the range was previously * PRIVATE, i.e. the slot's possible hugepage ranges are changing. * Zapping SPTEs in this case ensures KVM will reassess whether or not * a hugepage can be used for affected ranges. 
*/ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; return kvm_unmap_gfn_range(kvm, range); } static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; } static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; } static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; } static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) { const unsigned long start = gfn; const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); if (level == PG_LEVEL_2M) return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs); for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { if (hugepage_test_mixed(slot, gfn, level - 1) || attrs != kvm_get_memory_attributes(kvm, gfn)) return false; } return true; } bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { unsigned long attrs = range->arg.attributes; struct kvm_memory_slot *slot = range->slot; int level; lockdep_assert_held_write(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); /* * Calculate which ranges can be mapped with hugepages even if the slot * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over * a range that has PRIVATE GFNs, and conversely converting a range to * SHARED may now allow hugepages. */ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; /* * The sequence matters here: upper levels consume the result of lower * level's scanning. */ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); gfn_t gfn = gfn_round_for_level(range->start, level); /* Process the head page if it straddles the range. */ if (gfn != range->start || gfn + nr_pages > range->end) { /* * Skip mixed tracking if the aligned gfn isn't covered * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes. */ if (gfn >= slot->base_gfn && gfn + nr_pages <= slot->base_gfn + slot->npages) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else hugepage_set_mixed(slot, gfn, level); } gfn += nr_pages; } /* * Pages entirely covered by the range are guaranteed to have * only the attributes which were just set. */ for ( ; gfn + nr_pages <= range->end; gfn += nr_pages) hugepage_clear_mixed(slot, gfn, level); /* * Process the last tail page if it straddles the range and is * contained by the memslot. Like the head page, KVM can't * create a hugepage if the slot size is misaligned. */ if (gfn < range->end && (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else hugepage_set_mixed(slot, gfn, level); } } return false; } void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, struct kvm_memory_slot *slot) { int level; if (!kvm_arch_has_private_mem(kvm)) return; for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { /* * Don't bother tracking mixed attributes for pages that can't * be huge due to alignment, i.e. process only pages that are * entirely contained by the memslot. 
*/ gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level); gfn_t start = gfn_round_for_level(slot->base_gfn, level); gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); gfn_t gfn; if (start < slot->base_gfn) start += nr_pages; /* * Unlike setting attributes, every potential hugepage needs to * be manually checked as the attributes may already be mixed. */ for (gfn = start; gfn < end; gfn += nr_pages) { unsigned long attrs = kvm_get_memory_attributes(kvm, gfn); if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else hugepage_set_mixed(slot, gfn, level); } } } #endif
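The memory-attribute tracking above leans entirely on alignment arithmetic: a gfn is rounded down to a hugepage boundary, head and tail blocks that straddle the memslot are skipped, and only fully covered blocks are scanned per level. Below is a minimal standalone userspace sketch of that arithmetic, assuming x86's 9-bits-per-level paging; the helpers pages_per_level() and round_gfn_down() and the example memslot values are illustrative stand-ins, not kernel API.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t gfn_t;

/* 4K = level 1, 2M = level 2, 1G = level 3; each level covers 9 more bits. */
static gfn_t pages_per_level(int level)
{
	return (gfn_t)1 << ((level - 1) * 9);
}

/* Round a gfn down to the hugepage boundary for the given level. */
static gfn_t round_gfn_down(gfn_t gfn, int level)
{
	return gfn & ~(pages_per_level(level) - 1);
}

int main(void)
{
	gfn_t base_gfn = 0x12345, npages = 0x800;	/* arbitrary example memslot */
	int level;

	for (level = 2; level <= 3; level++) {
		gfn_t nr = pages_per_level(level);
		gfn_t start = round_gfn_down(base_gfn, level);
		gfn_t end = round_gfn_down(base_gfn + npages, level);

		/* A head block that straddles the slot start can never be huge. */
		if (start < base_gfn)
			start += nr;

		printf("level %d: %llu aligned block(s) fully inside the slot\n",
		       level,
		       (unsigned long long)(end > start ? (end - start) / nr : 0));
	}
	return 0;
}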
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 Red Hat * * based in parts on udlfb.c: * Copyright (C) 2009 Roberto De Ioris <roberto@unbit.it> * Copyright (C) 2009 Jaya Kumar <jayakumar.lkml@gmail.com> * Copyright (C) 2009 Bernie Thompson <bernie@plugable.com> */ #include <drm/drm.h> #include <drm/drm_print.h> #include <drm/drm_probe_helper.h> #include "udl_drv.h" /* -BULK_SIZE as per usb-skeleton. Can we get full page and avoid overhead?
*/ #define BULK_SIZE 512 #define NR_USB_REQUEST_CHANNEL 0x12 #define MAX_TRANSFER (PAGE_SIZE*16 - BULK_SIZE) #define WRITES_IN_FLIGHT (20) #define MAX_VENDOR_DESCRIPTOR_SIZE 256 static struct urb *udl_get_urb_locked(struct udl_device *udl, long timeout); static int udl_parse_vendor_descriptor(struct udl_device *udl) { struct usb_device *udev = udl_to_usb_device(udl); char *desc; char *buf; char *desc_end; u8 total_len = 0; buf = kzalloc(MAX_VENDOR_DESCRIPTOR_SIZE, GFP_KERNEL); if (!buf) return false; desc = buf; total_len = usb_get_descriptor(udev, 0x5f, /* vendor specific */ 0, desc, MAX_VENDOR_DESCRIPTOR_SIZE); if (total_len > 5) { DRM_INFO("vendor descriptor length:%x data:%11ph\n", total_len, desc); if ((desc[0] != total_len) || /* descriptor length */ (desc[1] != 0x5f) || /* vendor descriptor type */ (desc[2] != 0x01) || /* version (2 bytes) */ (desc[3] != 0x00) || (desc[4] != total_len - 2)) /* length after type */ goto unrecognized; desc_end = desc + total_len; desc += 5; /* the fixed header we've already parsed */ while (desc < desc_end) { u8 length; u16 key; key = le16_to_cpu(*((u16 *) desc)); desc += sizeof(u16); length = *desc; desc++; switch (key) { case 0x0200: { /* max_area */ u32 max_area; max_area = le32_to_cpu(*((u32 *)desc)); DRM_DEBUG("DL chip limited to %d pixel modes\n", max_area); udl->sku_pixel_limit = max_area; break; } default: break; } desc += length; } } goto success; unrecognized: /* allow udlfb to load for now even if firmware unrecognized */ DRM_ERROR("Unrecognized vendor firmware descriptor\n"); success: kfree(buf); return true; } /* * Need to ensure a channel is selected before submitting URBs */ int udl_select_std_channel(struct udl_device *udl) { static const u8 set_def_chn[] = {0x57, 0xCD, 0xDC, 0xA7, 0x1C, 0x88, 0x5E, 0x15, 0x60, 0xFE, 0xC6, 0x97, 0x16, 0x3D, 0x47, 0xF2}; void *sendbuf; int ret; struct usb_device *udev = udl_to_usb_device(udl); sendbuf = kmemdup(set_def_chn, sizeof(set_def_chn), GFP_KERNEL); if (!sendbuf) return -ENOMEM; ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), NR_USB_REQUEST_CHANNEL, (USB_DIR_OUT | USB_TYPE_VENDOR), 0, 0, sendbuf, sizeof(set_def_chn), USB_CTRL_SET_TIMEOUT); kfree(sendbuf); return ret < 0 ? 
ret : 0; } void udl_urb_completion(struct urb *urb) { struct urb_node *unode = urb->context; struct udl_device *udl = unode->dev; unsigned long flags; /* sync/async unlink faults aren't errors */ if (urb->status) { if (!(urb->status == -ENOENT || urb->status == -ECONNRESET || urb->status == -EPROTO || urb->status == -ESHUTDOWN)) { DRM_ERROR("%s - nonzero write bulk status received: %d\n", __func__, urb->status); } } urb->transfer_buffer_length = udl->urbs.size; /* reset to actual */ spin_lock_irqsave(&udl->urbs.lock, flags); list_add_tail(&unode->entry, &udl->urbs.list); udl->urbs.available++; spin_unlock_irqrestore(&udl->urbs.lock, flags); wake_up(&udl->urbs.sleep); } static void udl_free_urb_list(struct drm_device *dev) { struct udl_device *udl = to_udl(dev); struct urb_node *unode; struct urb *urb; DRM_DEBUG("Waiting for completes and freeing all render urbs\n"); /* keep waiting and freeing, until we've got 'em all */ while (udl->urbs.count) { spin_lock_irq(&udl->urbs.lock); urb = udl_get_urb_locked(udl, MAX_SCHEDULE_TIMEOUT); udl->urbs.count--; spin_unlock_irq(&udl->urbs.lock); if (WARN_ON(!urb)) break; unode = urb->context; /* Free each separately allocated piece */ usb_free_coherent(urb->dev, udl->urbs.size, urb->transfer_buffer, urb->transfer_dma); usb_free_urb(urb); kfree(unode); } wake_up_all(&udl->urbs.sleep); } static int udl_alloc_urb_list(struct drm_device *dev, int count, size_t size) { struct udl_device *udl = to_udl(dev); struct urb *urb; struct urb_node *unode; char *buf; size_t wanted_size = count * size; struct usb_device *udev = udl_to_usb_device(udl); spin_lock_init(&udl->urbs.lock); INIT_LIST_HEAD(&udl->urbs.list); init_waitqueue_head(&udl->urbs.sleep); udl->urbs.count = 0; udl->urbs.available = 0; retry: udl->urbs.size = size; while (udl->urbs.count * size < wanted_size) { unode = kzalloc(sizeof(struct urb_node), GFP_KERNEL); if (!unode) break; unode->dev = udl; urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) { kfree(unode); break; } unode->urb = urb; buf = usb_alloc_coherent(udev, size, GFP_KERNEL, &urb->transfer_dma); if (!buf) { kfree(unode); usb_free_urb(urb); if (size > PAGE_SIZE) { size /= 2; udl_free_urb_list(dev); goto retry; } break; } /* urb->transfer_buffer_length set to actual before submit */ usb_fill_bulk_urb(urb, udev, usb_sndbulkpipe(udev, 1), buf, size, udl_urb_completion, unode); urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; list_add_tail(&unode->entry, &udl->urbs.list); udl->urbs.count++; udl->urbs.available++; } DRM_DEBUG("allocated %d %d byte urbs\n", udl->urbs.count, (int) size); return udl->urbs.count; } static struct urb *udl_get_urb_locked(struct udl_device *udl, long timeout) { struct urb_node *unode; assert_spin_locked(&udl->urbs.lock); /* Wait for an in-flight buffer to complete and get re-queued */ if (!wait_event_lock_irq_timeout(udl->urbs.sleep, !udl->urbs.count || !list_empty(&udl->urbs.list), udl->urbs.lock, timeout)) { DRM_INFO("wait for urb interrupted: available: %d\n", udl->urbs.available); return NULL; } if (!udl->urbs.count) return NULL; unode = list_first_entry(&udl->urbs.list, struct urb_node, entry); list_del_init(&unode->entry); udl->urbs.available--; return unode->urb; } #define GET_URB_TIMEOUT HZ struct urb *udl_get_urb(struct drm_device *dev) { struct udl_device *udl = to_udl(dev); struct urb *urb; spin_lock_irq(&udl->urbs.lock); urb = udl_get_urb_locked(udl, GET_URB_TIMEOUT); spin_unlock_irq(&udl->urbs.lock); return urb; } int udl_submit_urb(struct drm_device *dev, struct urb *urb, size_t len) { struct udl_device *udl = 
to_udl(dev); int ret; if (WARN_ON(len > udl->urbs.size)) { ret = -EINVAL; goto error; } urb->transfer_buffer_length = len; /* set to actual payload len */ ret = usb_submit_urb(urb, GFP_ATOMIC); error: if (ret) { udl_urb_completion(urb); /* because no one else will */ DRM_ERROR("usb_submit_urb error %x\n", ret); } return ret; } /* wait until all pending URBs have been processed */ void udl_sync_pending_urbs(struct drm_device *dev) { struct udl_device *udl = to_udl(dev); spin_lock_irq(&udl->urbs.lock); /* 2 seconds as a sane timeout */ if (!wait_event_lock_irq_timeout(udl->urbs.sleep, udl->urbs.available == udl->urbs.count, udl->urbs.lock, msecs_to_jiffies(2000))) drm_err(dev, "Timeout for syncing pending URBs\n"); spin_unlock_irq(&udl->urbs.lock); } int udl_init(struct udl_device *udl) { struct drm_device *dev = &udl->drm; int ret = -ENOMEM; DRM_DEBUG("\n"); udl->dmadev = usb_intf_get_dma_device(to_usb_interface(dev->dev)); if (!udl->dmadev) drm_warn(dev, "buffer sharing not supported"); /* not an error */ mutex_init(&udl->gem_lock); if (!udl_parse_vendor_descriptor(udl)) { ret = -ENODEV; DRM_ERROR("firmware not recognized. Assume incompatible device\n"); goto err; } if (udl_select_std_channel(udl)) DRM_ERROR("Selecting channel failed\n"); if (!udl_alloc_urb_list(dev, WRITES_IN_FLIGHT, MAX_TRANSFER)) { DRM_ERROR("udl_alloc_urb_list failed\n"); goto err; } DRM_DEBUG("\n"); ret = udl_modeset_init(dev); if (ret) goto err; drm_kms_helper_poll_init(dev); return 0; err: if (udl->urbs.count) udl_free_urb_list(dev); put_device(udl->dmadev); DRM_ERROR("%d\n", ret); return ret; } int udl_drop_usb(struct drm_device *dev) { struct udl_device *udl = to_udl(dev); udl_free_urb_list(dev); put_device(udl->dmadev); udl->dmadev = NULL; return 0; }
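udl_alloc_urb_list(), udl_get_urb() and udl_urb_completion() above implement a classic fixed-size buffer pool: a writer sleeps until a completed URB is returned to the free list. The following standalone userspace sketch models the same hand-out/return pattern with a mutex and condition variable instead of the driver's spinlock and waitqueue; the pool type and the pool_get()/pool_put() names are illustrative assumptions, not driver API.

#include <pthread.h>
#include <stdio.h>

#define POOL_SIZE 4

struct buf_pool {
	pthread_mutex_t lock;
	pthread_cond_t avail;
	int free_idx[POOL_SIZE];
	int nfree;
};

static void pool_init(struct buf_pool *p)
{
	pthread_mutex_init(&p->lock, NULL);
	pthread_cond_init(&p->avail, NULL);
	p->nfree = POOL_SIZE;
	for (int i = 0; i < POOL_SIZE; i++)
		p->free_idx[i] = i;
}

/* Like udl_get_urb(): block until a buffer is free, then hand it out. */
static int pool_get(struct buf_pool *p)
{
	pthread_mutex_lock(&p->lock);
	while (!p->nfree)
		pthread_cond_wait(&p->avail, &p->lock);
	int idx = p->free_idx[--p->nfree];
	pthread_mutex_unlock(&p->lock);
	return idx;
}

/* Like udl_urb_completion(): return the buffer and wake one waiter. */
static void pool_put(struct buf_pool *p, int idx)
{
	pthread_mutex_lock(&p->lock);
	p->free_idx[p->nfree++] = idx;
	pthread_cond_signal(&p->avail);
	pthread_mutex_unlock(&p->lock);
}

int main(void)
{
	struct buf_pool pool;

	pool_init(&pool);
	int a = pool_get(&pool), b = pool_get(&pool);
	printf("got buffers %d and %d, %d still free\n", a, b, pool.nfree);
	pool_put(&pool, a);
	pool_put(&pool, b);
	return 0;
}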
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" #include "xfs_log.h" #include "xfs_error.h" static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_dq_logitem, qli_item); } /* * returns the number of iovecs needed to log the given dquot item. */ STATIC void xfs_qm_dquot_logitem_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { *nvecs += 2; *nbytes += sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot); } /* * fills in the vector of log iovecs for the given dquot log item. */ STATIC void xfs_qm_dquot_logitem_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { struct xfs_disk_dquot ddq; struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); struct xfs_log_iovec *vecp = NULL; struct xfs_dq_logformat *qlf; qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); qlf->qlf_type = XFS_LI_DQUOT; qlf->qlf_size = 2; qlf->qlf_id = qlip->qli_dquot->q_id; qlf->qlf_blkno = qlip->qli_dquot->q_blkno; qlf->qlf_len = 1; qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); xfs_dquot_to_disk(&ddq, qlip->qli_dquot); xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &ddq, sizeof(struct xfs_disk_dquot)); } /* * Increment the pin count of the given dquot. */ STATIC void xfs_qm_dquot_logitem_pin( struct xfs_log_item *lip) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); atomic_inc(&dqp->q_pincount); } /* * Decrement the pin count of the given dquot, and wake up * anyone in xfs_dqwait_unpin() if the count goes to 0. The * dquot must have been previously pinned with a call to * xfs_qm_dquot_logitem_pin(). */ STATIC void xfs_qm_dquot_logitem_unpin( struct xfs_log_item *lip, int remove) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; ASSERT(atomic_read(&dqp->q_pincount) > 0); if (atomic_dec_and_test(&dqp->q_pincount)) wake_up(&dqp->q_pinwait); } /* * This is called to wait for the given dquot to be unpinned. * Most of these pin/unpin routines are plagiarized from inode code. */ void xfs_qm_dqunpin_wait( struct xfs_dquot *dqp) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); if (atomic_read(&dqp->q_pincount) == 0) return; /* * Give the log a push so we don't wait here too long.
*/ xfs_log_force(dqp->q_mount, 0); wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, struct list_head *buffer_list) __releases(&lip->li_ailp->ail_lock) __acquires(&lip->li_ailp->ail_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; /* * Re-check the pincount now that we stabilized the value by * taking the quota lock. */ if (atomic_read(&dqp->q_pincount) > 0) { rval = XFS_ITEM_PINNED; goto out_unlock; } /* * Someone else is already flushing the dquot. Nothing we can do * here but wait for the flush to finish and remove the item from * the AIL. */ if (!xfs_dqflock_nowait(dqp)) { rval = XFS_ITEM_FLUSHING; goto out_unlock; } spin_unlock(&lip->li_ailp->ail_lock); error = xfs_qm_dqflush(dqp, &bp); if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); } else if (error == -EAGAIN) rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: xfs_dqunlock(dqp); return rval; } STATIC void xfs_qm_dquot_logitem_release( struct xfs_log_item *lip) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); /* * dquots are never 'held' from getting unlocked at the end of * a transaction. Their locking and unlocking is hidden inside the * transaction layer, within trans_commit. Hence, no LI_HOLD flag * for the logitem. */ xfs_dqunlock(dqp); } STATIC void xfs_qm_dquot_logitem_committing( struct xfs_log_item *lip, xfs_csn_t seq) { return xfs_qm_dquot_logitem_release(lip); } #ifdef DEBUG_EXPENSIVE static int xfs_qm_dquot_logitem_precommit( struct xfs_trans *tp, struct xfs_log_item *lip) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_mount *mp = dqp->q_mount; struct xfs_disk_dquot ddq = { }; xfs_failaddr_t fa; xfs_dquot_to_disk(&ddq, dqp); fa = xfs_dquot_verify(mp, &ddq, dqp->q_id); if (fa) { XFS_CORRUPTION_ERROR("Bad dquot during logging", XFS_ERRLEVEL_LOW, mp, &ddq, sizeof(ddq)); xfs_alert(mp, "Metadata corruption detected at %pS, dquot 0x%x", fa, dqp->q_id); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); ASSERT(fa == NULL); } return 0; } #else # define xfs_qm_dquot_logitem_precommit NULL #endif static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_precommit = xfs_qm_dquot_logitem_precommit, .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, .iop_unpin = xfs_qm_dquot_logitem_unpin, .iop_release = xfs_qm_dquot_logitem_release, .iop_committing = xfs_qm_dquot_logitem_committing, .iop_push = xfs_qm_dquot_logitem_push, }; /* * Initialize the dquot log item for a newly allocated dquot. * The dquot isn't locked at this point, but it isn't on any of the lists * either, so we don't care. */ void xfs_qm_dquot_logitem_init( struct xfs_dquot *dqp) { struct xfs_dq_logitem *lp = &dqp->q_logitem; xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); lp->qli_dquot = dqp; }
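The dquot pin/unpin helpers above keep an atomic pin count, and xfs_qm_dqunpin_wait() may only proceed once that count drops to zero. Below is a minimal standalone C11 sketch of just that counting rule; dq_pin()/dq_unpin() are illustrative names, and the kernel additionally pairs the counter with a waitqueue and a log force rather than polling.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int pincount;	/* zero-initialized, like a freshly created dquot */

static void dq_pin(void)
{
	atomic_fetch_add(&pincount, 1);
}

/* Returns true when the last pin is dropped, i.e. when waiters may proceed. */
static bool dq_unpin(void)
{
	return atomic_fetch_sub(&pincount, 1) == 1;
}

int main(void)
{
	dq_pin();
	dq_pin();
	printf("pinned twice, count=%d\n", atomic_load(&pincount));
	printf("first unpin wakes waiters: %s\n", dq_unpin() ? "yes" : "no");
	printf("second unpin wakes waiters: %s\n", dq_unpin() ? "yes" : "no");
	return 0;
}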
// SPDX-License-Identifier: GPL-2.0-only /* * Yama Linux Security Module * * Author: Kees Cook <keescook@chromium.org> * * Copyright (C) 2010 Canonical, Ltd. * Copyright (C) 2011 The Chromium OS Authors.
*/ #include <linux/lsm_hooks.h> #include <linux/sysctl.h> #include <linux/ptrace.h> #include <linux/prctl.h> #include <linux/ratelimit.h> #include <linux/workqueue.h> #include <linux/string_helpers.h> #include <linux/task_work.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <uapi/linux/lsm.h> #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 #define YAMA_SCOPE_CAPABILITY 2 #define YAMA_SCOPE_NO_ATTACH 3 static int ptrace_scope = YAMA_SCOPE_RELATIONAL; /* describe a ptrace relationship for potential exception */ struct ptrace_relation { struct task_struct *tracer; struct task_struct *tracee; bool invalid; struct list_head node; struct rcu_head rcu; }; static LIST_HEAD(ptracer_relations); static DEFINE_SPINLOCK(ptracer_relations_lock); static void yama_relation_cleanup(struct work_struct *work); static DECLARE_WORK(yama_relation_work, yama_relation_cleanup); struct access_report_info { struct callback_head work; const char *access; struct task_struct *target; struct task_struct *agent; }; static void __report_access(struct callback_head *work) { struct access_report_info *info = container_of(work, struct access_report_info, work); char *target_cmd, *agent_cmd; target_cmd = kstrdup_quotable_cmdline(info->target, GFP_KERNEL); agent_cmd = kstrdup_quotable_cmdline(info->agent, GFP_KERNEL); pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", info->access, target_cmd, info->target->pid, agent_cmd, info->agent->pid); kfree(agent_cmd); kfree(target_cmd); put_task_struct(info->agent); put_task_struct(info->target); kfree(info); } /* defers execution because cmdline access can sleep */ static void report_access(const char *access, struct task_struct *target, struct task_struct *agent) { struct access_report_info *info; char agent_comm[sizeof(agent->comm)]; assert_spin_locked(&target->alloc_lock); /* for target->comm */ if (current->flags & PF_KTHREAD) { /* I don't think kthreads call task_work_run() before exiting. * Imagine angry ranting about procfs here. */ pr_notice_ratelimited( "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n", access, target->comm, target->pid, get_task_comm(agent_comm, agent), agent->pid); return; } info = kmalloc(sizeof(*info), GFP_ATOMIC); if (!info) return; init_task_work(&info->work, __report_access); get_task_struct(target); get_task_struct(agent); info->access = access; info->target = target; info->agent = agent; if (task_work_add(current, &info->work, TWA_RESUME) == 0) return; /* success */ WARN(1, "report_access called from exiting task"); put_task_struct(target); put_task_struct(agent); kfree(info); } /** * yama_relation_cleanup - remove invalid entries from the relation list * @work: unused * */ static void yama_relation_cleanup(struct work_struct *work) { struct ptrace_relation *relation; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) { list_del_rcu(&relation->node); kfree_rcu(relation, rcu); } } rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); } /** * yama_ptracer_add - add/replace an exception for this tracer/tracee pair * @tracer: the task_struct of the process doing the ptrace * @tracee: the task_struct of the process to be ptraced * * Each tracee can have, at most, one tracer registered. Each time this * is called, the prior registered tracer will be replaced for the tracee. * * Returns 0 if relationship was added, -ve on error. 
*/ static int yama_ptracer_add(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation, *added; added = kmalloc(sizeof(*added), GFP_KERNEL); if (!added) return -ENOMEM; added->tracee = tracee; added->tracer = tracer; added->invalid = false; spin_lock(&ptracer_relations_lock); rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { list_replace_rcu(&relation->node, &added->node); kfree_rcu(relation, rcu); goto out; } } list_add_rcu(&added->node, &ptracer_relations); out: rcu_read_unlock(); spin_unlock(&ptracer_relations_lock); return 0; } /** * yama_ptracer_del - remove exceptions related to the given tasks * @tracer: remove any relation where tracer task matches * @tracee: remove any relation where tracee task matches */ static void yama_ptracer_del(struct task_struct *tracer, struct task_struct *tracee) { struct ptrace_relation *relation; bool marked = false; rcu_read_lock(); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee || (tracer && relation->tracer == tracer)) { relation->invalid = true; marked = true; } } rcu_read_unlock(); if (marked) schedule_work(&yama_relation_work); } /** * yama_task_free - check for task_pid to remove from exception list * @task: task being removed */ static void yama_task_free(struct task_struct *task) { yama_ptracer_del(task, task); } /** * yama_task_prctl - check for Yama-specific prctl operations * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Return 0 on success, -ve on error. -ENOSYS is returned when Yama * does not handle the given option. */ static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int rc = -ENOSYS; struct task_struct *myself = current; switch (option) { case PR_SET_PTRACER: /* Since a thread can call prctl(), find the group leader * before calling _add() or _del() on it, since we want * process-level granularity of control. The tracer group * leader checking is handled later when walking the ancestry * at the time of PTRACE_ATTACH check. */ rcu_read_lock(); if (!thread_group_leader(myself)) myself = rcu_dereference(myself->group_leader); get_task_struct(myself); rcu_read_unlock(); if (arg2 == 0) { yama_ptracer_del(NULL, myself); rc = 0; } else if (arg2 == PR_SET_PTRACER_ANY || (int)arg2 == -1) { rc = yama_ptracer_add(NULL, myself); } else { struct task_struct *tracer; tracer = find_get_task_by_vpid(arg2); if (!tracer) { rc = -EINVAL; } else { rc = yama_ptracer_add(tracer, myself); put_task_struct(tracer); } } put_task_struct(myself); break; } return rc; } /** * task_is_descendant - walk up a process family tree looking for a match * @parent: the process to compare against while walking up from child * @child: the process to start from while looking upwards for parent * * Returns 1 if child is a descendant of parent, 0 if not. 
*/ static int task_is_descendant(struct task_struct *parent, struct task_struct *child) { int rc = 0; struct task_struct *walker = child; if (!parent || !child) return 0; rcu_read_lock(); if (!thread_group_leader(parent)) parent = rcu_dereference(parent->group_leader); while (walker->pid > 0) { if (!thread_group_leader(walker)) walker = rcu_dereference(walker->group_leader); if (walker == parent) { rc = 1; break; } walker = rcu_dereference(walker->real_parent); } rcu_read_unlock(); return rc; } /** * ptracer_exception_found - tracer registered as exception for this tracee * @tracer: the task_struct of the process attempting ptrace * @tracee: the task_struct of the process to be ptraced * * Returns 1 if tracer has a ptracer exception ancestor for tracee. */ static int ptracer_exception_found(struct task_struct *tracer, struct task_struct *tracee) { int rc = 0; struct ptrace_relation *relation; struct task_struct *parent = NULL; bool found = false; rcu_read_lock(); /* * If there's already an active tracing relationship, then make an * exception for the sake of other accesses, like process_vm_rw(). */ parent = ptrace_parent(tracee); if (parent != NULL && same_thread_group(parent, tracer)) { rc = 1; goto unlock; } /* Look for a PR_SET_PTRACER relationship. */ if (!thread_group_leader(tracee)) tracee = rcu_dereference(tracee->group_leader); list_for_each_entry_rcu(relation, &ptracer_relations, node) { if (relation->invalid) continue; if (relation->tracee == tracee) { parent = relation->tracer; found = true; break; } } if (found && (parent == NULL || task_is_descendant(parent, tracer))) rc = 1; unlock: rcu_read_unlock(); return rc; } /** * yama_ptrace_access_check - validate PTRACE_ATTACH calls * @child: task that current task is attempting to ptrace * @mode: ptrace attach mode * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_access_check(struct task_struct *child, unsigned int mode) { int rc = 0; /* require ptrace target be a child of ptracer on attach */ if (mode & PTRACE_MODE_ATTACH) { switch (ptrace_scope) { case YAMA_SCOPE_DISABLED: /* No additional restrictions. */ break; case YAMA_SCOPE_RELATIONAL: rcu_read_lock(); if (!pid_alive(child)) rc = -EPERM; if (!rc && !task_is_descendant(current, child) && !ptracer_exception_found(current, child) && !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_CAPABILITY: rcu_read_lock(); if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: default: rc = -EPERM; break; } } if (rc && (mode & PTRACE_MODE_NOAUDIT) == 0) report_access("attach", child, current); return rc; } /** * yama_ptrace_traceme - validate PTRACE_TRACEME calls * @parent: task that will become the ptracer of the current task * * Returns 0 if following the ptrace is allowed, -ve on error. */ static int yama_ptrace_traceme(struct task_struct *parent) { int rc = 0; /* Only disallow PTRACE_TRACEME on more aggressive settings. 
*/ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: if (!has_ns_capability(parent, current_user_ns(), CAP_SYS_PTRACE)) rc = -EPERM; break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM; break; } if (rc) { task_lock(current); report_access("traceme", current, parent); task_unlock(current); } return rc; } static const struct lsm_id yama_lsmid = { .name = "yama", .id = LSM_ID_YAMA, }; static struct security_hook_list yama_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme), LSM_HOOK_INIT(task_prctl, yama_task_prctl), LSM_HOOK_INIT(task_free, yama_task_free), }; #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; if (write && !capable(CAP_SYS_PTRACE)) return -EPERM; /* Lock the max value if it ever gets set. */ table_copy = *table; if (*(int *)table_copy.data == *(int *)table_copy.extra2) table_copy.extra1 = table_copy.extra2; return proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); } static int max_scope = YAMA_SCOPE_NO_ATTACH; static struct ctl_table yama_sysctl_table[] = { { .procname = "ptrace_scope", .data = &ptrace_scope, .maxlen = sizeof(int), .mode = 0644, .proc_handler = yama_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &max_scope, }, }; static void __init yama_init_sysctl(void) { if (!register_sysctl("kernel/yama", yama_sysctl_table)) panic("Yama: sysctl registration failed.\n"); } #else static inline void yama_init_sysctl(void) { } #endif
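With ptrace_scope set to YAMA_SCOPE_RELATIONAL, a process can still whitelist one tracer through prctl(PR_SET_PTRACER), which is handled by yama_task_prctl() and yama_ptracer_add() above. The standalone userspace sketch below grants that permission to the parent process; the choice of the parent as tracer and the minimal error handling are illustrative, not a recommendation taken from the code above.

#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_PTRACER
#define PR_SET_PTRACER 0x59616d61	/* "Yama" */
#endif

int main(void)
{
	/* Allow our parent (e.g. a crash handler or debugger launcher) to attach. */
	if (prctl(PR_SET_PTRACER, (unsigned long)getppid(), 0, 0, 0))
		perror("prctl(PR_SET_PTRACER)");
	else
		printf("parent %d may now ptrace us despite ptrace_scope=1\n",
		       (int)getppid());

	pause();	/* keep running so the tracer has something to attach to */
	return 0;
}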