Total coverage: 369401 (19%)of 2011523
2 6 10 10 10 8 7 1 10 6 1 1 1 2 2 1 4 3 5 5 5 5 5 5 5 1 4 4 4 3 5 5 5 5 5 5 5 5 5 6 5 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/fat/inode.c * * Written 1992,1993 by Werner Almesberger * VFAT extensions by Gordon Chaffee, merged with msdos fs by Henrik Storner * Rewritten for the constant inumbers support by Al Viro * * Fixes: * * Max Cohan: Fixed invalid FSINFO offset when info_sector is 0 */ #include <linux/module.h> #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/vfs.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/unaligned.h> #include <linux/random.h> #include <linux/iversion.h> #include "fat.h" #ifndef CONFIG_FAT_DEFAULT_IOCHARSET /* if user don't select VFAT, this is undefined. */ #define CONFIG_FAT_DEFAULT_IOCHARSET "" #endif #define KB_IN_SECTORS 2 /* DOS dates from 1980/1/1 through 2107/12/31 */ #define FAT_DATE_MIN (0<<9 | 1<<5 | 1) #define FAT_DATE_MAX (127<<9 | 12<<5 | 31) #define FAT_TIME_MAX (23<<11 | 59<<5 | 29) /* * A deserialized copy of the on-disk structure laid out in struct * fat_boot_sector. */ struct fat_bios_param_block { u16 fat_sector_size; u8 fat_sec_per_clus; u16 fat_reserved; u8 fat_fats; u16 fat_dir_entries; u16 fat_sectors; u16 fat_fat_length; u32 fat_total_sect; u8 fat16_state; u32 fat16_vol_id; u32 fat32_length; u32 fat32_root_cluster; u16 fat32_info_sector; u8 fat32_state; u32 fat32_vol_id; }; static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE; static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET; static struct fat_floppy_defaults { unsigned nr_sectors; unsigned sec_per_clus; unsigned dir_entries; unsigned media; unsigned fat_length; } floppy_defaults[] = { { .nr_sectors = 160 * KB_IN_SECTORS, .sec_per_clus = 1, .dir_entries = 64, .media = 0xFE, .fat_length = 1, }, { .nr_sectors = 180 * KB_IN_SECTORS, .sec_per_clus = 1, .dir_entries = 64, .media = 0xFC, .fat_length = 2, }, { .nr_sectors = 320 * KB_IN_SECTORS, .sec_per_clus = 2, .dir_entries = 112, .media = 0xFF, .fat_length = 1, }, { .nr_sectors = 360 * KB_IN_SECTORS, .sec_per_clus = 2, .dir_entries = 112, .media = 0xFD, .fat_length = 2, }, }; int fat_add_cluster(struct inode *inode) { int err, cluster; err = fat_alloc_clusters(inode, &cluster, 1); if (err) return err; /* FIXME: this cluster should be added after data of this * cluster is writed */ err = fat_chain_add(inode, cluster, 1); if (err) fat_free_clusters(inode, cluster); return err; } static inline int __fat_get_block(struct inode *inode, sector_t iblock, unsigned long *max_blocks, struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); unsigned long mapped_blocks; sector_t phys, last_block; int err, offset; err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; if (phys) { map_bh(bh_result, sb, phys); *max_blocks = min(mapped_blocks, *max_blocks); return 0; } if (!create) return 0; if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)", MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); return -EIO; } last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9); offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); /* * allocate a cluster according to the following. * 1) no more available blocks * 2) not part of fallocate region */ if (!offset && !(iblock < last_block)) { /* TODO: multiple cluster allocation would be desirable. */ err = fat_add_cluster(inode); if (err) return err; } /* available blocks on this cluster */ mapped_blocks = sbi->sec_per_clus - offset; *max_blocks = min(mapped_blocks, *max_blocks); MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; if (!phys) { fat_fs_error(sb, "invalid FAT chain (i_pos %lld, last_block %llu)", MSDOS_I(inode)->i_pos, (unsigned long long)last_block); return -EIO; } BUG_ON(*max_blocks != mapped_blocks); set_buffer_new(bh_result); map_bh(bh_result, sb, phys); return 0; } static int fat_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; int err; err = __fat_get_block(inode, iblock, &max_blocks, bh_result, create); if (err) return err; bh_result->b_size = max_blocks << sb->s_blocksize_bits; return 0; } static int fat_writepages(struct address_space *mapping, struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, fat_get_block); } static int fat_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, fat_get_block); } static void fat_readahead(struct readahead_control *rac) { mpage_readahead(rac, fat_get_block); } static void fat_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); fat_truncate_blocks(inode, inode->i_size); } } static int fat_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int err; err = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, fat_get_block, &MSDOS_I(mapping->host)->mmu_private); if (err < 0) fat_write_failed(mapping, pos + len); return err; } static int fat_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { fat_truncate_time(inode, NULL, S_CTIME|S_MTIME); MSDOS_I(inode)->i_attrs |= ATTR_ARCH; mark_inode_dirty(inode); } return err; } static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; ssize_t ret; if (iov_iter_rw(iter) == WRITE) { /* * FIXME: blockdev_direct_IO() doesn't use ->write_begin(), * so we need to update the ->mmu_private to block boundary. * * But we must fill the remaining area or hole by nul for * updating ->mmu_private. * * Return 0, and fallback to normal buffered write. */ loff_t size = offset + count; if (MSDOS_I(inode)->mmu_private < size) return 0; } /* * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). */ ret = blockdev_direct_IO(iocb, inode, iter, fat_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) fat_write_failed(mapping, offset + count); return ret; } static int fat_get_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; int err; sector_t bmap; unsigned long mapped_blocks; BUG_ON(create != 0); err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true); if (err) return err; if (bmap) { map_bh(bh_result, sb, bmap); max_blocks = min(mapped_blocks, max_blocks); } bh_result->b_size = max_blocks << sb->s_blocksize_bits; return 0; } static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&MSDOS_I(mapping->host)->truncate_lock); blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; } /* * fat_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This is required during truncate to physically zeroout the tail end * of that block so it doesn't yield old data if the file is later grown. * Also, avoid causing failure from fsx for cases of "data past EOF" */ int fat_block_truncate_page(struct inode *inode, loff_t from) { return block_truncate_page(inode->i_mapping, from, fat_get_block); } static const struct address_space_operations fat_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = fat_read_folio, .readahead = fat_readahead, .writepages = fat_writepages, .write_begin = fat_write_begin, .write_end = fat_write_end, .direct_IO = fat_direct_IO, .bmap = _fat_bmap, .migrate_folio = buffer_migrate_folio, }; /* * New FAT inode stuff. We do the following: * a) i_ino is constant and has nothing with on-disk location. * b) FAT manages its own cache of directory entries. * c) *This* cache is indexed by on-disk location. * d) inode has an associated directory entry, all right, but * it may be unhashed. * e) currently entries are stored within struct inode. That should * change. * f) we deal with races in the following way: * 1. readdir() and lookup() do FAT-dir-cache lookup. * 2. rename() unhashes the F-d-c entry and rehashes it in * a new place. * 3. unlink() and rmdir() unhash F-d-c entry. * 4. fat_write_inode() checks whether the thing is unhashed. * If it is we silently return. If it isn't we do bread(), * check if the location is still valid and retry if it * isn't. Otherwise we do changes. * 5. Spinlock is used to protect hash/unhash/location check/lookup * 6. fat_evict_inode() unhashes the F-d-c entry. * 7. lookup() and readdir() do igrab() if they find a F-d-c entry * and consider negative result as cache miss. */ static void fat_hash_init(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); int i; spin_lock_init(&sbi->inode_hash_lock); for (i = 0; i < FAT_HASH_SIZE; i++) INIT_HLIST_HEAD(&sbi->inode_hashtable[i]); } static inline unsigned long fat_hash(loff_t i_pos) { return hash_32(i_pos, FAT_HASH_BITS); } static void dir_hash_init(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); int i; spin_lock_init(&sbi->dir_hash_lock); for (i = 0; i < FAT_HASH_SIZE; i++) INIT_HLIST_HEAD(&sbi->dir_hashtable[i]); } void fat_attach(struct inode *inode, loff_t i_pos) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); if (inode->i_ino != MSDOS_ROOT_INO) { struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); spin_lock(&sbi->inode_hash_lock); MSDOS_I(inode)->i_pos = i_pos; hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head); spin_unlock(&sbi->inode_hash_lock); } /* If NFS support is enabled, cache the mapping of start cluster * to directory inode. This is used during reconnection of * dentries to the filesystem root. */ if (S_ISDIR(inode->i_mode) && sbi->options.nfs) { struct hlist_head *d_head = sbi->dir_hashtable; d_head += fat_dir_hash(MSDOS_I(inode)->i_logstart); spin_lock(&sbi->dir_hash_lock); hlist_add_head(&MSDOS_I(inode)->i_dir_hash, d_head); spin_unlock(&sbi->dir_hash_lock); } } EXPORT_SYMBOL_GPL(fat_attach); void fat_detach(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); spin_lock(&sbi->inode_hash_lock); MSDOS_I(inode)->i_pos = 0; hlist_del_init(&MSDOS_I(inode)->i_fat_hash); spin_unlock(&sbi->inode_hash_lock); if (S_ISDIR(inode->i_mode) && sbi->options.nfs) { spin_lock(&sbi->dir_hash_lock); hlist_del_init(&MSDOS_I(inode)->i_dir_hash); spin_unlock(&sbi->dir_hash_lock); } } EXPORT_SYMBOL_GPL(fat_detach); struct inode *fat_iget(struct super_block *sb, loff_t i_pos) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); struct msdos_inode_info *i; struct inode *inode = NULL; spin_lock(&sbi->inode_hash_lock); hlist_for_each_entry(i, head, i_fat_hash) { BUG_ON(i->vfs_inode.i_sb != sb); if (i->i_pos != i_pos) continue; inode = igrab(&i->vfs_inode); if (inode) break; } spin_unlock(&sbi->inode_hash_lock); return inode; } static int is_exec(unsigned char *extension) { unsigned char exe_extensions[] = "EXECOMBAT", *walk; for (walk = exe_extensions; *walk; walk += 3) if (!strncmp(extension, walk, 3)) return 1; return 0; } static int fat_calc_dir_size(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int ret, fclus, dclus; inode->i_size = 0; if (MSDOS_I(inode)->i_start == 0) return 0; ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus); if (ret < 0) return ret; inode->i_size = (fclus + 1) << sbi->cluster_bits; return 0; } static int fat_validate_dir(struct inode *dir) { struct super_block *sb = dir->i_sb; if (dir->i_nlink < 2) { /* Directory should have "."/".." entries at least. */ fat_fs_error(sb, "corrupted directory (invalid entries)"); return -EIO; } if (MSDOS_I(dir)->i_start == 0 || MSDOS_I(dir)->i_start == MSDOS_SB(sb)->root_cluster) { /* Directory should point valid cluster. */ fat_fs_error(sb, "corrupted directory (invalid i_start)"); return -EIO; } return 0; } /* doesn't deal with root inode */ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); struct timespec64 mtime; int error; MSDOS_I(inode)->i_pos = 0; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); inode->i_generation = get_random_u32(); if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { inode->i_generation &= ~1; inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO); inode->i_op = sbi->dir_ops; inode->i_fop = &fat_dir_operations; MSDOS_I(inode)->i_start = fat_get_start(sbi, de); MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; error = fat_calc_dir_size(inode); if (error < 0) return error; MSDOS_I(inode)->mmu_private = inode->i_size; set_nlink(inode, fat_subdirs(inode)); error = fat_validate_dir(inode); if (error < 0) return error; } else { /* not a directory */ inode->i_generation |= 1; inode->i_mode = fat_make_mode(sbi, de->attr, ((sbi->options.showexec && !is_exec(de->name + 8)) ? S_IRUGO|S_IWUGO : S_IRWXUGO)); MSDOS_I(inode)->i_start = fat_get_start(sbi, de); MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; inode->i_size = le32_to_cpu(de->size); inode->i_op = &fat_file_inode_operations; inode->i_fop = &fat_file_operations; inode->i_mapping->a_ops = &fat_aops; MSDOS_I(inode)->mmu_private = inode->i_size; } if (de->attr & ATTR_SYS) { if (sbi->options.sys_immutable) inode->i_flags |= S_IMMUTABLE; } fat_save_attrs(inode, de->attr); inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; fat_time_fat2unix(sbi, &mtime, de->time, de->date, 0); inode_set_mtime_to_ts(inode, mtime); inode_set_ctime_to_ts(inode, mtime); if (sbi->options.isvfat) { struct timespec64 atime; fat_time_fat2unix(sbi, &atime, 0, de->adate, 0); inode_set_atime_to_ts(inode, atime); fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime, de->cdate, de->ctime_cs); } else inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, &mtime)); return 0; } static inline void fat_lock_build_inode(struct msdos_sb_info *sbi) { if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) mutex_lock(&sbi->nfs_build_inode_lock); } static inline void fat_unlock_build_inode(struct msdos_sb_info *sbi) { if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) mutex_unlock(&sbi->nfs_build_inode_lock); } struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos) { struct inode *inode; int err; fat_lock_build_inode(MSDOS_SB(sb)); inode = fat_iget(sb, i_pos); if (inode) goto out; inode = new_inode(sb); if (!inode) { inode = ERR_PTR(-ENOMEM); goto out; } inode->i_ino = iunique(sb, MSDOS_ROOT_INO); inode_set_iversion(inode, 1); err = fat_fill_inode(inode, de); if (err) { iput(inode); inode = ERR_PTR(err); goto out; } fat_attach(inode, i_pos); insert_inode_hash(inode); out: fat_unlock_build_inode(MSDOS_SB(sb)); return inode; } EXPORT_SYMBOL_GPL(fat_build_inode); static int __fat_write_inode(struct inode *inode, int wait); static void fat_free_eofblocks(struct inode *inode) { /* Release unwritten fallocated blocks on inode eviction. */ if ((inode->i_blocks << 9) > round_up(MSDOS_I(inode)->mmu_private, MSDOS_SB(inode->i_sb)->cluster_size)) { int err; fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private); /* Fallocate results in updating the i_start/iogstart * for the zero byte file. So, make it return to * original state during evict and commit it to avoid * any corruption on the next access to the cluster * chain for the file. */ err = __fat_write_inode(inode, inode_needs_sync(inode)); if (err) { fat_msg(inode->i_sb, KERN_WARNING, "Failed to " "update on disk inode for unused " "fallocated blocks, inode could be " "corrupted. Please run fsck"); } } } static void fat_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); } else fat_free_eofblocks(inode); invalidate_inode_buffers(inode); clear_inode(inode); fat_cache_inval_inode(inode); fat_detach(inode); } static void fat_set_state(struct super_block *sb, unsigned int set, unsigned int force) { struct buffer_head *bh; struct fat_boot_sector *b; struct msdos_sb_info *sbi = MSDOS_SB(sb); /* do not change any thing if mounted read only */ if (sb_rdonly(sb) && !force) return; /* do not change state if fs was dirty */ if (sbi->dirty) { /* warn only on set (mount). */ if (set) fat_msg(sb, KERN_WARNING, "Volume was not properly " "unmounted. Some data may be corrupt. " "Please run fsck."); return; } bh = sb_bread(sb, 0); if (bh == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector " "to mark fs as dirty"); return; } b = (struct fat_boot_sector *) bh->b_data; if (is_fat32(sbi)) { if (set) b->fat32.state |= FAT_STATE_DIRTY; else b->fat32.state &= ~FAT_STATE_DIRTY; } else /* fat 16 and 12 */ { if (set) b->fat16.state |= FAT_STATE_DIRTY; else b->fat16.state &= ~FAT_STATE_DIRTY; } mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); } static void fat_reset_iocharset(struct fat_mount_options *opts) { if (opts->iocharset != fat_default_iocharset) { /* Note: opts->iocharset can be NULL here */ kfree(opts->iocharset); opts->iocharset = fat_default_iocharset; } } static void delayed_free(struct rcu_head *p) { struct msdos_sb_info *sbi = container_of(p, struct msdos_sb_info, rcu); unload_nls(sbi->nls_disk); unload_nls(sbi->nls_io); fat_reset_iocharset(&sbi->options); kfree(sbi); } static void fat_put_super(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); fat_set_state(sb, 0, 0); iput(sbi->fsinfo_inode); iput(sbi->fat_inode); call_rcu(&sbi->rcu, delayed_free); } static struct kmem_cache *fat_inode_cachep; static struct inode *fat_alloc_inode(struct super_block *sb) { struct msdos_inode_info *ei; ei = alloc_inode_sb(sb, fat_inode_cachep, GFP_NOFS); if (!ei) return NULL; init_rwsem(&ei->truncate_lock); /* Zeroing to allow iput() even if partial initialized inode. */ ei->mmu_private = 0; ei->i_start = 0; ei->i_logstart = 0; ei->i_attrs = 0; ei->i_pos = 0; ei->i_crtime.tv_sec = 0; ei->i_crtime.tv_nsec = 0; return &ei->vfs_inode; } static void fat_free_inode(struct inode *inode) { kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } static void init_once(void *foo) { struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; spin_lock_init(&ei->cache_lru_lock); ei->nr_caches = 0; ei->cache_valid_id = FAT_CACHE_VALID + 1; INIT_LIST_HEAD(&ei->cache_lru); INIT_HLIST_NODE(&ei->i_fat_hash); INIT_HLIST_NODE(&ei->i_dir_hash); inode_init_once(&ei->vfs_inode); } static int __init fat_init_inodecache(void) { fat_inode_cachep = kmem_cache_create("fat_inode_cache", sizeof(struct msdos_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (fat_inode_cachep == NULL) return -ENOMEM; return 0; } static void __exit fat_destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(fat_inode_cachep); } int fat_reconfigure(struct fs_context *fc) { bool new_rdonly; struct super_block *sb = fc->root->d_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); fc->sb_flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); sync_filesystem(sb); /* make sure we update state on remount. */ new_rdonly = fc->sb_flags & SB_RDONLY; if (new_rdonly != sb_rdonly(sb)) { if (new_rdonly) fat_set_state(sb, 0, 0); else fat_set_state(sb, 1, 1); } return 0; } EXPORT_SYMBOL_GPL(fat_reconfigure); static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); /* If the count of free cluster is still unknown, counts it here. */ if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { int err = fat_count_free_clusters(dentry->d_sb); if (err) return err; } buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = sbi->cluster_size; buf->f_blocks = sbi->max_cluster - FAT_START_ENT; buf->f_bfree = sbi->free_clusters; buf->f_bavail = sbi->free_clusters; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE; return 0; } static int __fat_write_inode(struct inode *inode, int wait) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct msdos_dir_entry *raw_entry; struct timespec64 mtime; loff_t i_pos; sector_t blocknr; int err, offset; if (inode->i_ino == MSDOS_ROOT_INO) return 0; retry: i_pos = fat_i_pos_read(sbi, inode); if (!i_pos) return 0; fat_get_blknr_offset(sbi, i_pos, &blocknr, &offset); bh = sb_bread(sb, blocknr); if (!bh) { fat_msg(sb, KERN_ERR, "unable to read inode block " "for updating (i_pos %lld)", i_pos); return -EIO; } spin_lock(&sbi->inode_hash_lock); if (i_pos != MSDOS_I(inode)->i_pos) { spin_unlock(&sbi->inode_hash_lock); brelse(bh); goto retry; } raw_entry = &((struct msdos_dir_entry *) (bh->b_data))[offset]; if (S_ISDIR(inode->i_mode)) raw_entry->size = 0; else raw_entry->size = cpu_to_le32(inode->i_size); raw_entry->attr = fat_make_attrs(inode); fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart); mtime = inode_get_mtime(inode); fat_time_unix2fat(sbi, &mtime, &raw_entry->time, &raw_entry->date, NULL); if (sbi->options.isvfat) { struct timespec64 ts = inode_get_atime(inode); __le16 atime; fat_time_unix2fat(sbi, &ts, &atime, &raw_entry->adate, NULL); fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime, &raw_entry->cdate, &raw_entry->ctime_cs); } spin_unlock(&sbi->inode_hash_lock); mark_buffer_dirty(bh); err = 0; if (wait) err = sync_dirty_buffer(bh); brelse(bh); return err; } static int fat_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; if (inode->i_ino == MSDOS_FSINFO_INO) { struct super_block *sb = inode->i_sb; mutex_lock(&MSDOS_SB(sb)->s_lock); err = fat_clusters_flush(sb); mutex_unlock(&MSDOS_SB(sb)->s_lock); } else err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); return err; } int fat_sync_inode(struct inode *inode) { return __fat_write_inode(inode, 1); } EXPORT_SYMBOL_GPL(fat_sync_inode); static int fat_show_options(struct seq_file *m, struct dentry *root); static const struct super_operations fat_sops = { .alloc_inode = fat_alloc_inode, .free_inode = fat_free_inode, .write_inode = fat_write_inode, .evict_inode = fat_evict_inode, .put_super = fat_put_super, .statfs = fat_statfs, .show_options = fat_show_options, }; static int fat_show_options(struct seq_file *m, struct dentry *root) { struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb); struct fat_mount_options *opts = &sbi->options; int isvfat = opts->isvfat; if (!uid_eq(opts->fs_uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, opts->fs_uid)); if (!gid_eq(opts->fs_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, opts->fs_gid)); seq_printf(m, ",fmask=%04o", opts->fs_fmask); seq_printf(m, ",dmask=%04o", opts->fs_dmask); if (opts->allow_utime) seq_printf(m, ",allow_utime=%04o", opts->allow_utime); if (sbi->nls_disk) /* strip "cp" prefix from displayed option */ seq_printf(m, ",codepage=%s", &sbi->nls_disk->charset[2]); if (isvfat) { if (sbi->nls_io) seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); switch (opts->shortname) { case VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95: seq_puts(m, ",shortname=win95"); break; case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT: seq_puts(m, ",shortname=winnt"); break; case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95: seq_puts(m, ",shortname=mixed"); break; case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: seq_puts(m, ",shortname=lower"); break; default: seq_puts(m, ",shortname=unknown"); break; } } if (opts->name_check != 'n') seq_printf(m, ",check=%c", opts->name_check); if (opts->usefree) seq_puts(m, ",usefree"); if (opts->quiet) seq_puts(m, ",quiet"); if (opts->showexec) seq_puts(m, ",showexec"); if (opts->sys_immutable) seq_puts(m, ",sys_immutable"); if (!isvfat) { if (opts->dotsOK) seq_puts(m, ",dotsOK=yes"); if (opts->nocase) seq_puts(m, ",nocase"); } else { if (opts->utf8) seq_puts(m, ",utf8"); if (opts->unicode_xlate) seq_puts(m, ",uni_xlate"); if (!opts->numtail) seq_puts(m, ",nonumtail"); if (opts->rodir) seq_puts(m, ",rodir"); } if (opts->flush) seq_puts(m, ",flush"); if (opts->tz_set) { if (opts->time_offset) seq_printf(m, ",time_offset=%d", opts->time_offset); else seq_puts(m, ",tz=UTC"); } if (opts->errors == FAT_ERRORS_CONT) seq_puts(m, ",errors=continue"); else if (opts->errors == FAT_ERRORS_PANIC) seq_puts(m, ",errors=panic"); else seq_puts(m, ",errors=remount-ro"); if (opts->nfs == FAT_NFS_NOSTALE_RO) seq_puts(m, ",nfs=nostale_ro"); else if (opts->nfs) seq_puts(m, ",nfs=stale_rw"); if (opts->discard) seq_puts(m, ",discard"); if (opts->dos1xfloppy) seq_puts(m, ",dos1xfloppy"); return 0; } enum { Opt_check, Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_allow_utime, Opt_codepage, Opt_usefree, Opt_nocase, Opt_quiet, Opt_showexec, Opt_debug, Opt_immutable, Opt_dots, Opt_dotsOK, Opt_charset, Opt_shortname, Opt_utf8, Opt_utf8_bool, Opt_uni_xl, Opt_uni_xl_bool, Opt_nonumtail, Opt_nonumtail_bool, Opt_obsolete, Opt_flush, Opt_tz, Opt_rodir, Opt_errors, Opt_discard, Opt_nfs, Opt_nfs_enum, Opt_time_offset, Opt_dos1xfloppy, }; static const struct constant_table fat_param_check[] = { {"relaxed", 'r'}, {"r", 'r'}, {"strict", 's'}, {"s", 's'}, {"normal", 'n'}, {"n", 'n'}, {} }; static const struct constant_table fat_param_tz[] = { {"UTC", 0}, {} }; static const struct constant_table fat_param_errors[] = { {"continue", FAT_ERRORS_CONT}, {"panic", FAT_ERRORS_PANIC}, {"remount-ro", FAT_ERRORS_RO}, {} }; static const struct constant_table fat_param_nfs[] = { {"stale_rw", FAT_NFS_STALE_RW}, {"nostale_ro", FAT_NFS_NOSTALE_RO}, {} }; /* * These are all obsolete but we still reject invalid options. * The corresponding values are therefore meaningless. */ static const struct constant_table fat_param_conv[] = { {"binary", 0}, {"text", 0}, {"auto", 0}, {"b", 0}, {"t", 0}, {"a", 0}, {} }; /* Core options. See below for vfat and msdos extras */ const struct fs_parameter_spec fat_param_spec[] = { fsparam_enum ("check", Opt_check, fat_param_check), fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("umask", Opt_umask), fsparam_u32oct ("dmask", Opt_dmask), fsparam_u32oct ("fmask", Opt_fmask), fsparam_u32oct ("allow_utime", Opt_allow_utime), fsparam_u32 ("codepage", Opt_codepage), fsparam_flag ("usefree", Opt_usefree), fsparam_flag ("nocase", Opt_nocase), fsparam_flag ("quiet", Opt_quiet), fsparam_flag ("showexec", Opt_showexec), fsparam_flag ("debug", Opt_debug), fsparam_flag ("sys_immutable", Opt_immutable), fsparam_flag ("flush", Opt_flush), fsparam_enum ("tz", Opt_tz, fat_param_tz), fsparam_s32 ("time_offset", Opt_time_offset), fsparam_enum ("errors", Opt_errors, fat_param_errors), fsparam_flag ("discard", Opt_discard), fsparam_flag ("nfs", Opt_nfs), fsparam_enum ("nfs", Opt_nfs_enum, fat_param_nfs), fsparam_flag ("dos1xfloppy", Opt_dos1xfloppy), __fsparam(fs_param_is_enum, "conv", Opt_obsolete, fs_param_deprecated, fat_param_conv), __fsparam(fs_param_is_u32, "fat", Opt_obsolete, fs_param_deprecated, NULL), __fsparam(fs_param_is_u32, "blocksize", Opt_obsolete, fs_param_deprecated, NULL), __fsparam(fs_param_is_string, "cvf_format", Opt_obsolete, fs_param_deprecated, NULL), __fsparam(fs_param_is_string, "cvf_options", Opt_obsolete, fs_param_deprecated, NULL), __fsparam(NULL, "posix", Opt_obsolete, fs_param_deprecated, NULL), {} }; EXPORT_SYMBOL_GPL(fat_param_spec); static const struct fs_parameter_spec msdos_param_spec[] = { fsparam_flag_no ("dots", Opt_dots), fsparam_bool ("dotsOK", Opt_dotsOK), {} }; static const struct constant_table fat_param_shortname[] = { {"lower", VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95}, {"win95", VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95}, {"winnt", VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT}, {"mixed", VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95}, {} }; static const struct fs_parameter_spec vfat_param_spec[] = { fsparam_string ("iocharset", Opt_charset), fsparam_enum ("shortname", Opt_shortname, fat_param_shortname), fsparam_flag ("utf8", Opt_utf8), fsparam_bool ("utf8", Opt_utf8_bool), fsparam_flag ("uni_xlate", Opt_uni_xl), fsparam_bool ("uni_xlate", Opt_uni_xl_bool), fsparam_flag ("nonumtail", Opt_nonumtail), fsparam_bool ("nonumtail", Opt_nonumtail_bool), fsparam_flag ("rodir", Opt_rodir), {} }; int fat_parse_param(struct fs_context *fc, struct fs_parameter *param, bool is_vfat) { struct fat_mount_options *opts = fc->fs_private; struct fs_parse_result result; int opt; /* remount options have traditionally been ignored */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) return 0; opt = fs_parse(fc, fat_param_spec, param, &result); /* If option not found in fat_param_spec, try vfat/msdos options */ if (opt == -ENOPARAM) { if (is_vfat) opt = fs_parse(fc, vfat_param_spec, param, &result); else opt = fs_parse(fc, msdos_param_spec, param, &result); } if (opt < 0) return opt; switch (opt) { case Opt_check: opts->name_check = result.uint_32; break; case Opt_usefree: opts->usefree = 1; break; case Opt_nocase: if (!is_vfat) opts->nocase = 1; else { /* for backward compatibility */ opts->shortname = VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95; } break; case Opt_quiet: opts->quiet = 1; break; case Opt_showexec: opts->showexec = 1; break; case Opt_debug: opts->debug = 1; break; case Opt_immutable: opts->sys_immutable = 1; break; case Opt_uid: opts->fs_uid = result.uid; break; case Opt_gid: opts->fs_gid = result.gid; break; case Opt_umask: opts->fs_fmask = opts->fs_dmask = result.uint_32; break; case Opt_dmask: opts->fs_dmask = result.uint_32; break; case Opt_fmask: opts->fs_fmask = result.uint_32; break; case Opt_allow_utime: opts->allow_utime = result.uint_32 & (S_IWGRP | S_IWOTH); break; case Opt_codepage: opts->codepage = result.uint_32; break; case Opt_flush: opts->flush = 1; break; case Opt_time_offset: /* * GMT+-12 zones may have DST corrections so at least * 13 hours difference is needed. Make the limit 24 * just in case someone invents something unusual. */ if (result.int_32 < -24 * 60 || result.int_32 > 24 * 60) return -EINVAL; opts->tz_set = 1; opts->time_offset = result.int_32; break; case Opt_tz: opts->tz_set = 1; opts->time_offset = result.uint_32; break; case Opt_errors: opts->errors = result.uint_32; break; case Opt_nfs: opts->nfs = FAT_NFS_STALE_RW; break; case Opt_nfs_enum: opts->nfs = result.uint_32; break; case Opt_dos1xfloppy: opts->dos1xfloppy = 1; break; /* msdos specific */ case Opt_dots: /* dots / nodots */ opts->dotsOK = !result.negated; break; case Opt_dotsOK: /* dotsOK = yes/no */ opts->dotsOK = result.boolean; break; /* vfat specific */ case Opt_charset: fat_reset_iocharset(opts); opts->iocharset = param->string; param->string = NULL; /* Steal string */ break; case Opt_shortname: opts->shortname = result.uint_32; break; case Opt_utf8: opts->utf8 = 1; break; case Opt_utf8_bool: opts->utf8 = result.boolean; break; case Opt_uni_xl: opts->unicode_xlate = 1; break; case Opt_uni_xl_bool: opts->unicode_xlate = result.boolean; break; case Opt_nonumtail: opts->numtail = 0; /* negated option */ break; case Opt_nonumtail_bool: opts->numtail = !result.boolean; /* negated option */ break; case Opt_rodir: opts->rodir = 1; break; case Opt_discard: opts->discard = 1; break; /* obsolete mount options */ case Opt_obsolete: printk(KERN_INFO "FAT-fs: \"%s\" option is obsolete, " "not supported now", param->key); break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL_GPL(fat_parse_param); static int fat_read_root(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); inode->i_generation = 0; inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO); inode->i_op = sbi->dir_ops; inode->i_fop = &fat_dir_operations; if (is_fat32(sbi)) { MSDOS_I(inode)->i_start = sbi->root_cluster; error = fat_calc_dir_size(inode); if (error < 0) return error; } else { MSDOS_I(inode)->i_start = 0; inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry); } inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; MSDOS_I(inode)->i_logstart = 0; MSDOS_I(inode)->mmu_private = inode->i_size; fat_save_attrs(inode, ATTR_DIR); inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime(inode, 0, 0))); set_nlink(inode, fat_subdirs(inode)+2); return 0; } static unsigned long calc_fat_clusters(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); /* Divide first to avoid overflow */ if (!is_fat12(sbi)) { unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits; return ent_per_sec * sbi->fat_length; } return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; } static bool fat_bpb_is_zero(struct fat_boot_sector *b) { if (get_unaligned_le16(&b->sector_size)) return false; if (b->sec_per_clus) return false; if (b->reserved) return false; if (b->fats) return false; if (get_unaligned_le16(&b->dir_entries)) return false; if (get_unaligned_le16(&b->sectors)) return false; if (b->media) return false; if (b->fat_length) return false; if (b->secs_track) return false; if (b->heads) return false; return true; } static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b, int silent, struct fat_bios_param_block *bpb) { int error = -EINVAL; /* Read in BPB ... */ memset(bpb, 0, sizeof(*bpb)); bpb->fat_sector_size = get_unaligned_le16(&b->sector_size); bpb->fat_sec_per_clus = b->sec_per_clus; bpb->fat_reserved = le16_to_cpu(b->reserved); bpb->fat_fats = b->fats; bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries); bpb->fat_sectors = get_unaligned_le16(&b->sectors); bpb->fat_fat_length = le16_to_cpu(b->fat_length); bpb->fat_total_sect = le32_to_cpu(b->total_sect); bpb->fat16_state = b->fat16.state; bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id); bpb->fat32_length = le32_to_cpu(b->fat32.length); bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster); bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector); bpb->fat32_state = b->fat32.state; bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id); /* Validate this looks like a FAT filesystem BPB */ if (!bpb->fat_reserved) { if (!silent) fat_msg(sb, KERN_ERR, "bogus number of reserved sectors"); goto out; } if (!bpb->fat_fats) { if (!silent) fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); goto out; } /* * Earlier we checked here that b->secs_track and b->head are nonzero, * but it turns out valid FAT filesystems can have zero there. */ if (!fat_valid_media(b->media)) { if (!silent) fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", (unsigned)b->media); goto out; } if (!is_power_of_2(bpb->fat_sector_size) || (bpb->fat_sector_size < 512) || (bpb->fat_sector_size > 4096)) { if (!silent) fat_msg(sb, KERN_ERR, "bogus logical sector size %u", (unsigned)bpb->fat_sector_size); goto out; } if (!is_power_of_2(bpb->fat_sec_per_clus)) { if (!silent) fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", (unsigned)bpb->fat_sec_per_clus); goto out; } if (bpb->fat_fat_length == 0 && bpb->fat32_length == 0) { if (!silent) fat_msg(sb, KERN_ERR, "bogus number of FAT sectors"); goto out; } error = 0; out: return error; } static int fat_read_static_bpb(struct super_block *sb, struct fat_boot_sector *b, int silent, struct fat_bios_param_block *bpb) { static const char *notdos1x = "This doesn't look like a DOS 1.x volume"; sector_t bd_sects = bdev_nr_sectors(sb->s_bdev); struct fat_floppy_defaults *fdefaults = NULL; int error = -EINVAL; unsigned i; /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */ if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) { if (!silent) fat_msg(sb, KERN_ERR, "%s; no bootstrapping code", notdos1x); goto out; } /* * If any value in this region is non-zero, it isn't archaic * DOS. */ if (!fat_bpb_is_zero(b)) { if (!silent) fat_msg(sb, KERN_ERR, "%s; DOS 2.x BPB is non-zero", notdos1x); goto out; } for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) { if (floppy_defaults[i].nr_sectors == bd_sects) { fdefaults = &floppy_defaults[i]; break; } } if (fdefaults == NULL) { if (!silent) fat_msg(sb, KERN_WARNING, "This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)", (u64)bd_sects); goto out; } if (!silent) fat_msg(sb, KERN_INFO, "This looks like a DOS 1.x volume; assuming default BPB values"); memset(bpb, 0, sizeof(*bpb)); bpb->fat_sector_size = SECTOR_SIZE; bpb->fat_sec_per_clus = fdefaults->sec_per_clus; bpb->fat_reserved = 1; bpb->fat_fats = 2; bpb->fat_dir_entries = fdefaults->dir_entries; bpb->fat_sectors = fdefaults->nr_sectors; bpb->fat_fat_length = fdefaults->fat_length; error = 0; out: return error; } /* * Read the super block of an MS-DOS FS. */ int fat_fill_super(struct super_block *sb, struct fs_context *fc, void (*setup)(struct super_block *)) { struct fat_mount_options *opts = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; struct inode *root_inode = NULL, *fat_inode = NULL; struct inode *fsinfo_inode = NULL; struct buffer_head *bh; struct fat_bios_param_block bpb; struct msdos_sb_info *sbi; u16 logical_sector_size; u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors; long error; char buf[50]; struct timespec64 ts; /* * GFP_KERNEL is ok here, because while we do hold the * superblock lock, memory pressure can't call back into * the filesystem, since we're only just about to mount * it and have no inodes etc active! */ sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; sb->s_flags |= SB_NODIRATIME; sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; /* * fat timestamps are complex and truncated by fat itself, so * we set 1 here to be fast */ sb->s_time_gran = 1; mutex_init(&sbi->nfs_build_inode_lock); ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); /* UTF-8 doesn't provide FAT semantics */ if (!strcmp(opts->iocharset, "utf8")) { fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset" " for FAT filesystems, filesystem will be" " case sensitive!"); } /* If user doesn't specify allow_utime, it's initialized from dmask. */ if (opts->allow_utime == (unsigned short)-1) opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); if (opts->unicode_xlate) opts->utf8 = 0; if (opts->nfs == FAT_NFS_NOSTALE_RO) { sb->s_flags |= SB_RDONLY; sb->s_export_op = &fat_export_ops_nostale; } /* Apply parsed options to sbi (structure copy) */ sbi->options = *opts; /* Transfer ownership of iocharset to sbi->options */ opts->iocharset = NULL; setup(sb); /* flavour-specific stuff that needs options */ error = -EIO; sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); if (bh == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector"); goto out_fail; } error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent, &bpb); if (error == -EINVAL && sbi->options.dos1xfloppy) error = fat_read_static_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent, &bpb); brelse(bh); if (error == -EINVAL) goto out_invalid; else if (error) goto out_fail; logical_sector_size = bpb.fat_sector_size; sbi->sec_per_clus = bpb.fat_sec_per_clus; error = -EIO; if (logical_sector_size < sb->s_blocksize) { fat_msg(sb, KERN_ERR, "logical sector size too small for device" " (logical sector size = %u)", logical_sector_size); goto out_fail; } if (logical_sector_size > sb->s_blocksize) { struct buffer_head *bh_resize; if (!sb_set_blocksize(sb, logical_sector_size)) { fat_msg(sb, KERN_ERR, "unable to set blocksize %u", logical_sector_size); goto out_fail; } /* Verify that the larger boot sector is fully readable */ bh_resize = sb_bread(sb, 0); if (bh_resize == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector" " (logical sector size = %lu)", sb->s_blocksize); goto out_fail; } brelse(bh_resize); } mutex_init(&sbi->s_lock); sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus; sbi->cluster_bits = ffs(sbi->cluster_size) - 1; sbi->fats = bpb.fat_fats; sbi->fat_bits = 0; /* Don't know yet */ sbi->fat_start = bpb.fat_reserved; sbi->fat_length = bpb.fat_fat_length; sbi->root_cluster = 0; sbi->free_clusters = -1; /* Don't know yet */ sbi->free_clus_valid = 0; sbi->prev_free = FAT_START_ENT; sb->s_maxbytes = 0xffffffff; fat_time_fat2unix(sbi, &ts, 0, cpu_to_le16(FAT_DATE_MIN), 0); sb->s_time_min = ts.tv_sec; fat_time_fat2unix(sbi, &ts, cpu_to_le16(FAT_TIME_MAX), cpu_to_le16(FAT_DATE_MAX), 0); sb->s_time_max = ts.tv_sec; if (!sbi->fat_length && bpb.fat32_length) { struct fat_boot_fsinfo *fsinfo; struct buffer_head *fsinfo_bh; /* Must be FAT32 */ sbi->fat_bits = 32; sbi->fat_length = bpb.fat32_length; sbi->root_cluster = bpb.fat32_root_cluster; /* MC - if info_sector is 0, don't multiply by 0 */ sbi->fsinfo_sector = bpb.fat32_info_sector; if (sbi->fsinfo_sector == 0) sbi->fsinfo_sector = 1; fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector); if (fsinfo_bh == NULL) { fat_msg(sb, KERN_ERR, "bread failed, FSINFO block" " (sector = %lu)", sbi->fsinfo_sector); goto out_fail; } fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data; if (!IS_FSINFO(fsinfo)) { fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: " "0x%08x, 0x%08x (sector = %lu)", le32_to_cpu(fsinfo->signature1), le32_to_cpu(fsinfo->signature2), sbi->fsinfo_sector); } else { if (sbi->options.usefree) sbi->free_clus_valid = 1; sbi->free_clusters = le32_to_cpu(fsinfo->free_clusters); sbi->prev_free = le32_to_cpu(fsinfo->next_cluster); } brelse(fsinfo_bh); } /* interpret volume ID as a little endian 32 bit integer */ if (is_fat32(sbi)) sbi->vol_id = bpb.fat32_vol_id; else /* fat 16 or 12 */ sbi->vol_id = bpb.fat16_vol_id; __le32 vol_id_le = cpu_to_le32(sbi->vol_id); super_set_uuid(sb, (void *) &vol_id_le, sizeof(vol_id_le)); sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length; sbi->dir_entries = bpb.fat_dir_entries; if (sbi->dir_entries & (sbi->dir_per_block - 1)) { if (!silent) fat_msg(sb, KERN_ERR, "bogus number of directory entries" " (%u)", sbi->dir_entries); goto out_invalid; } rootdir_sectors = sbi->dir_entries * sizeof(struct msdos_dir_entry) / sb->s_blocksize; sbi->data_start = sbi->dir_start + rootdir_sectors; total_sectors = bpb.fat_sectors; if (total_sectors == 0) total_sectors = bpb.fat_total_sect; total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus; if (!is_fat32(sbi)) sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12; /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */ if (is_fat32(sbi)) sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY; else /* fat 16 or 12 */ sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY; /* check that FAT table does not overflow */ fat_clusters = calc_fat_clusters(sb); total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); if (total_clusters > max_fat(sb)) { if (!silent) fat_msg(sb, KERN_ERR, "count of clusters too big (%u)", total_clusters); goto out_invalid; } sbi->max_cluster = total_clusters + FAT_START_ENT; /* check the free_clusters, it's not necessarily correct */ if (sbi->free_clusters != -1 && sbi->free_clusters > total_clusters) sbi->free_clusters = -1; /* check the prev_free, it's not necessarily correct */ sbi->prev_free %= sbi->max_cluster; if (sbi->prev_free < FAT_START_ENT) sbi->prev_free = FAT_START_ENT; /* set up enough so that it can read an inode */ fat_hash_init(sb); dir_hash_init(sb); fat_ent_access_init(sb); /* * The low byte of the first FAT entry must have the same value as * the media field of the boot sector. But in real world, too many * devices are writing wrong values. So, removed that validity check. * * The removed check compared the first FAT entry to a value dependent * on the media field like this: * == (0x0F00 | media), for FAT12 * == (0XFF00 | media), for FAT16 * == (0x0FFFFF | media), for FAT32 */ error = -EINVAL; sprintf(buf, "cp%d", sbi->options.codepage); sbi->nls_disk = load_nls(buf); if (!sbi->nls_disk) { fat_msg(sb, KERN_ERR, "codepage %s not found", buf); goto out_fail; } /* FIXME: utf8 is using iocharset for upper/lower conversion */ if (sbi->options.isvfat) { sbi->nls_io = load_nls(sbi->options.iocharset); if (!sbi->nls_io) { fat_msg(sb, KERN_ERR, "IO charset %s not found", sbi->options.iocharset); goto out_fail; } } error = -ENOMEM; fat_inode = new_inode(sb); if (!fat_inode) goto out_fail; sbi->fat_inode = fat_inode; fsinfo_inode = new_inode(sb); if (!fsinfo_inode) goto out_fail; fsinfo_inode->i_ino = MSDOS_FSINFO_INO; sbi->fsinfo_inode = fsinfo_inode; insert_inode_hash(fsinfo_inode); root_inode = new_inode(sb); if (!root_inode) goto out_fail; root_inode->i_ino = MSDOS_ROOT_INO; inode_set_iversion(root_inode, 1); error = fat_read_root(root_inode); if (error < 0) { iput(root_inode); goto out_fail; } error = -ENOMEM; insert_inode_hash(root_inode); fat_attach(root_inode, 0); sb->s_root = d_make_root(root_inode); if (!sb->s_root) { fat_msg(sb, KERN_ERR, "get root inode failed"); goto out_fail; } if (sbi->options.discard && !bdev_max_discard_sectors(sb->s_bdev)) fat_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); fat_set_state(sb, 1, 0); return 0; out_invalid: error = -EINVAL; if (!silent) fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); out_fail: iput(fsinfo_inode); iput(fat_inode); unload_nls(sbi->nls_io); unload_nls(sbi->nls_disk); fat_reset_iocharset(&sbi->options); sb->s_fs_info = NULL; kfree(sbi); return error; } EXPORT_SYMBOL_GPL(fat_fill_super); /* * helper function for fat_flush_inodes. This writes both the inode * and the file data blocks, waiting for in flight data blocks before * the start of the call. It does not wait for any io started * during the call */ static int writeback_inode(struct inode *inode) { int ret; /* if we used wait=1, sync_inode_metadata waits for the io for the * inode to finish. So wait=0 is sent down to sync_inode_metadata * and filemap_fdatawrite is used for the data blocks */ ret = sync_inode_metadata(inode, 0); if (!ret) ret = filemap_fdatawrite(inode->i_mapping); return ret; } /* * write data and metadata corresponding to i1 and i2. The io is * started but we do not wait for any of it to finish. * * filemap_flush is used for the block device, so if there is a dirty * page for a block already in flight, we will not wait and start the * io over again */ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) { int ret = 0; if (!MSDOS_SB(sb)->options.flush) return 0; if (i1) ret = writeback_inode(i1); if (!ret && i2) ret = writeback_inode(i2); if (!ret) ret = sync_blockdev_nowait(sb->s_bdev); return ret; } EXPORT_SYMBOL_GPL(fat_flush_inodes); int fat_init_fs_context(struct fs_context *fc, bool is_vfat) { struct fat_mount_options *opts; opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) return -ENOMEM; opts->isvfat = is_vfat; opts->fs_uid = current_uid(); opts->fs_gid = current_gid(); opts->fs_fmask = opts->fs_dmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; fat_reset_iocharset(opts); if (is_vfat) { opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; opts->rodir = 0; } else { opts->shortname = 0; opts->rodir = 1; } opts->name_check = 'n'; opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; opts->unicode_xlate = 0; opts->numtail = 1; opts->usefree = opts->nocase = 0; opts->tz_set = 0; opts->nfs = 0; opts->errors = FAT_ERRORS_RO; opts->debug = 0; opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat; fc->fs_private = opts; /* fc->ops assigned by caller */ return 0; } EXPORT_SYMBOL_GPL(fat_init_fs_context); void fat_free_fc(struct fs_context *fc) { struct fat_mount_options *opts = fc->fs_private; if (opts->iocharset != fat_default_iocharset) kfree(opts->iocharset); kfree(fc->fs_private); } EXPORT_SYMBOL_GPL(fat_free_fc); static int __init init_fat_fs(void) { int err; err = fat_cache_init(); if (err) return err; err = fat_init_inodecache(); if (err) goto failed; return 0; failed: fat_cache_destroy(); return err; } static void __exit exit_fat_fs(void) { fat_cache_destroy(); fat_destroy_inodecache(); } module_init(init_fat_fs) module_exit(exit_fat_fs) MODULE_DESCRIPTION("Core FAT filesystem support"); MODULE_LICENSE("GPL");
1 1 5 6 6 4 2 2 4 2 2 1 1 1 1 1 1 8 3 8 8 1 1 1 2 2 2 2 1 1 1 9 5 5 5 1 1 1 1 1 4 4 1 1 1 1 4 4 4 1 1 1 1 1 1 1 2 2 1 1 2 1 1 8 8 8 8 4 54 1 1 1 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 12 1 62 62 55 55 54 55 54 54 2 2 2 19 1 1 2 2 2 2 2 384 384 381 3 3 2 2 2 2 2 1 2 1 378 378 19 2 1 2 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 5 1 38 20 1 1 1 1 1 1 1 1 3 1 2 1 1 2 1 1 8 1 1 1 1 1 1 11 1 1 1 1 1 1 1 4 5 8 8 1 1 1 4 1 3 40 1 39 39 19 19 19 39 19 18 19 19 11 11 11 8 33 32 1 1 1 1 1 1 1 1 1 2 1 1 2 1 2 10 19 19 1 1 2 2 1 1 3 3 3 2 3 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 // SPDX-License-Identifier: GPL-2.0-only /* * KVM Microsoft Hyper-V emulation * * derived from arch/x86/kvm/x86.c * * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "lapic.h" #include "ioapic.h" #include "cpuid.h" #include "hyperv.h" #include "mmu.h" #include "xen.h" #include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> #include <linux/spinlock.h> #include <linux/eventfd.h> #include <asm/apicdef.h> #include <asm/mshyperv.h> #include <trace/events/kvm.h> #include "trace.h" #include "irq.h" #include "fpu.h" #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK) /* * As per Hyper-V TLFS, extended hypercalls start from 0x8001 * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value * where each bit tells which extended hypercall is available besides * HvExtCallQueryCapabilities. * * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit * assigned. * * 0x8002 - Bit 0 * 0x8003 - Bit 1 * .. * 0x8041 - Bit 63 * * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64 */ #define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) { return atomic64_read(&synic->sint[sint]); } static inline int synic_get_sint_vector(u64 sint_value) { if (sint_value & HV_SYNIC_SINT_MASKED) return -1; return sint_value & HV_SYNIC_SINT_VECTOR_MASK; } static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic, int vector) { int i; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) return true; } return false; } static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, int vector) { int i; u64 sint_value; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { sint_value = synic_read_sint(synic, i); if (synic_get_sint_vector(sint_value) == vector && sint_value & HV_SYNIC_SINT_AUTO_EOI) return true; } return false; } static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); bool auto_eoi_old, auto_eoi_new; if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; if (synic_has_vector_connected(synic, vector)) __set_bit(vector, synic->vec_bitmap); else __clear_bit(vector, synic->vec_bitmap); auto_eoi_old = !bitmap_empty(synic->auto_eoi_bitmap, 256); if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); auto_eoi_new = !bitmap_empty(synic->auto_eoi_bitmap, 256); if (auto_eoi_old == auto_eoi_new) return; if (!enable_apicv) return; down_write(&vcpu->kvm->arch.apicv_update_lock); if (auto_eoi_new) hv->synic_auto_eoi_used++; else hv->synic_auto_eoi_used--; /* * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on * the hypervisor to manually inject IRQs. */ __kvm_set_or_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_HYPERV, !!hv->synic_auto_eoi_used); up_write(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, u64 data, bool host) { int vector, old_vector; bool masked; vector = data & HV_SYNIC_SINT_VECTOR_MASK; masked = data & HV_SYNIC_SINT_MASKED; /* * Valid vectors are 16-255, however, nested Hyper-V attempts to write * default '0x10000' value on boot and this should not #GP. We need to * allow zero-initing the register from host as well. */ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked) return 1; /* * Guest may configure multiple SINTs to use the same vector, so * we maintain a bitmap of vectors handled by synic, and a * bitmap of vectors with auto-eoi behavior. The bitmaps are * updated here, and atomically queried on fast paths. */ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK; atomic64_set(&synic->sint[sint], data); synic_update_vector(synic, old_vector); synic_update_vector(synic, vector); /* Load SynIC vectors into EOI exit bitmap */ kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic)); return 0; } static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu = NULL; unsigned long i; if (vpidx >= KVM_MAX_VCPUS) return NULL; vcpu = kvm_get_vcpu(kvm, vpidx); if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; return NULL; } static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu; struct kvm_vcpu_hv_synic *synic; vcpu = get_vcpu_by_vpidx(kvm, vpidx); if (!vcpu || !to_hv_vcpu(vcpu)) return NULL; synic = to_hv_synic(vcpu); return (synic->active) ? synic : NULL; } static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) { struct kvm *kvm = vcpu->kvm; struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; int gsi, idx; trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); /* Try to deliver pending Hyper-V SynIC timers messages */ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { stimer = &hv_vcpu->stimer[idx]; if (stimer->msg_pending && stimer->config.enable && !stimer->config.direct_mode && stimer->config.sintx == sint) stimer_mark_pending(stimer, false); } idx = srcu_read_lock(&kvm->irq_srcu); gsi = atomic_read(&synic->sint_to_gsi[sint]); if (gsi != -1) kvm_notify_acked_gsi(kvm, gsi); srcu_read_unlock(&kvm->irq_srcu, idx); } static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; hv_vcpu->exit.u.synic.msr = msr; hv_vcpu->exit.u.synic.control = synic->control; hv_vcpu->exit.u.synic.evt_page = synic->evt_page; hv_vcpu->exit.u.synic.msg_page = synic->msg_page; kvm_make_request(KVM_REQ_HV_EXIT, vcpu); } static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 data, bool host) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int ret; if (!synic->active && (!host || data)) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); ret = 0; switch (msr) { case HV_X64_MSR_SCONTROL: synic->control = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_SVERSION: if (!host) { ret = 1; break; } synic->version = data; break; case HV_X64_MSR_SIEFP: if ((data & HV_SYNIC_SIEFP_ENABLE) && !host && !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; break; } synic->evt_page = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_SIMP: if ((data & HV_SYNIC_SIMP_ENABLE) && !host && !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; break; } synic->msg_page = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_EOM: { int i; if (!synic->active) break; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) kvm_hv_notify_acked_sint(vcpu, i); break; } case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host); break; default: ret = 1; break; } return ret; } static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); return hv_vcpu->cpuid_cache.syndbg_cap_eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; } static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) hv->hv_syndbg.control.status = vcpu->run->hyperv.u.syndbg.status; return 1; } static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; hv_vcpu->exit.u.syndbg.msr = msr; hv_vcpu->exit.u.syndbg.control = syndbg->control.control; hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page; hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page; hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page; vcpu->arch.complete_userspace_io = kvm_hv_syndbg_complete_userspace; kvm_make_request(KVM_REQ_HV_EXIT, vcpu); } static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, to_hv_vcpu(vcpu)->vp_index, msr, data); switch (msr) { case HV_X64_MSR_SYNDBG_CONTROL: syndbg->control.control = data; if (!host) syndbg_exit(vcpu, msr); break; case HV_X64_MSR_SYNDBG_STATUS: syndbg->control.status = data; break; case HV_X64_MSR_SYNDBG_SEND_BUFFER: syndbg->control.send_page = data; break; case HV_X64_MSR_SYNDBG_RECV_BUFFER: syndbg->control.recv_page = data; break; case HV_X64_MSR_SYNDBG_PENDING_BUFFER: syndbg->control.pending_page = data; if (!host) syndbg_exit(vcpu, msr); break; case HV_X64_MSR_SYNDBG_OPTIONS: syndbg->options = data; break; default: break; } return 0; } static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; switch (msr) { case HV_X64_MSR_SYNDBG_CONTROL: *pdata = syndbg->control.control; break; case HV_X64_MSR_SYNDBG_STATUS: *pdata = syndbg->control.status; break; case HV_X64_MSR_SYNDBG_SEND_BUFFER: *pdata = syndbg->control.send_page; break; case HV_X64_MSR_SYNDBG_RECV_BUFFER: *pdata = syndbg->control.recv_page; break; case HV_X64_MSR_SYNDBG_PENDING_BUFFER: *pdata = syndbg->control.pending_page; break; case HV_X64_MSR_SYNDBG_OPTIONS: *pdata = syndbg->options; break; default: break; } trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata); return 0; } static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, bool host) { int ret; if (!synic->active && !host) return 1; ret = 0; switch (msr) { case HV_X64_MSR_SCONTROL: *pdata = synic->control; break; case HV_X64_MSR_SVERSION: *pdata = synic->version; break; case HV_X64_MSR_SIEFP: *pdata = synic->evt_page; break; case HV_X64_MSR_SIMP: *pdata = synic->msg_page; break; case HV_X64_MSR_EOM: *pdata = 0; break; case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]); break; default: ret = 1; break; } return ret; } static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_lapic_irq irq; int ret, vector; if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) return -EINVAL; if (sint >= ARRAY_SIZE(synic->sint)) return -EINVAL; vector = synic_get_sint_vector(synic_read_sint(synic, sint)); if (vector < 0) return -ENOENT; memset(&irq, 0, sizeof(irq)); irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { struct kvm_vcpu_hv_synic *synic; if (!level) return -1; synic = synic_get(kvm, e->hv_sint.vcpu); if (!synic) return -EINVAL; return synic_set_irq(synic, e->hv_sint.sint); } void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) { struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); int i; trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); for (i = 0; i < ARRAY_SIZE(synic->sint); i++) if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) kvm_hv_notify_acked_sint(vcpu, i); } static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) { struct kvm_vcpu_hv_synic *synic; synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; if (sint >= ARRAY_SIZE(synic->sint_to_gsi)) return -EINVAL; atomic_set(&synic->sint_to_gsi[sint], gsi); return 0; } void kvm_hv_irq_routing_update(struct kvm *kvm) { struct kvm_irq_routing_table *irq_rt; struct kvm_kernel_irq_routing_entry *e; u32 gsi; irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, lockdep_is_held(&kvm->irq_lock)); for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) { hlist_for_each_entry(e, &irq_rt->map[gsi], link) { if (e->type == KVM_IRQ_ROUTING_HV_SINT) kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu, e->hv_sint.sint, gsi); } } } static void synic_init(struct kvm_vcpu_hv_synic *synic) { int i; memset(synic, 0, sizeof(*synic)); synic->version = HV_SYNIC_VERSION_1; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED); atomic_set(&synic->sint_to_gsi[i], -1); } } static u64 get_time_ref_counter(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; u64 tsc; /* * Fall back to get_kvmclock_ns() when TSC page hasn't been set up, * is broken, disabled or being updated. */ if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET) return div_u64(get_kvmclock_ns(kvm), 100); vcpu = kvm_get_vcpu(kvm, 0); tsc = kvm_read_l1_tsc(vcpu, rdtsc()); return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64) + hv->tsc_ref.tsc_offset; } static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); set_bit(stimer->index, to_hv_vcpu(vcpu)->stimer_pending_bitmap); kvm_make_request(KVM_REQ_HV_STIMER, vcpu); if (vcpu_kick) kvm_vcpu_kick(vcpu); } static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); hrtimer_cancel(&stimer->timer); clear_bit(stimer->index, to_hv_vcpu(vcpu)->stimer_pending_bitmap); stimer->msg_pending = false; stimer->exp_time = 0; } static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) { struct kvm_vcpu_hv_stimer *stimer; stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); stimer_mark_pending(stimer, true); return HRTIMER_NORESTART; } /* * stimer_start() assumptions: * a) stimer->count is not equal to 0 * b) stimer->config has HV_STIMER_ENABLE flag */ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) { u64 time_now; ktime_t ktime_now; time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm); ktime_now = ktime_get(); if (stimer->config.periodic) { if (stimer->exp_time) { if (time_now >= stimer->exp_time) { u64 remainder; div64_u64_rem(time_now - stimer->exp_time, stimer->count, &remainder); stimer->exp_time = time_now + (stimer->count - remainder); } } else stimer->exp_time = time_now + stimer->count; trace_kvm_hv_stimer_start_periodic( hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->exp_time); hrtimer_start(&stimer->timer, ktime_add_ns(ktime_now, 100 * (stimer->exp_time - time_now)), HRTIMER_MODE_ABS); return 0; } stimer->exp_time = stimer->count; if (time_now >= stimer->count) { /* * Expire timer according to Hypervisor Top-Level Functional * specification v4(15.3.1): * "If a one shot is enabled and the specified count is in * the past, it will expire immediately." */ stimer_mark_pending(stimer, false); return 0; } trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->count); hrtimer_start(&stimer->timer, ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)), HRTIMER_MODE_ABS); return 0; } static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, bool host) { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && (!host || config)) return 1; if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode && !(hv_vcpu->cpuid_cache.features_edx & HV_STIMER_DIRECT_MODE_AVAILABLE))) return 1; trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); stimer_cleanup(stimer); if (old_config.enable && !new_config.direct_mode && new_config.sintx == 0) new_config.enable = 0; stimer->config.as_uint64 = new_config.as_uint64; if (stimer->config.enable) stimer_mark_pending(stimer, false); return 0; } static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && (!host || count)) return 1; trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); stimer_cleanup(stimer); stimer->count = count; if (!host) { if (stimer->count == 0) stimer->config.enable = 0; else if (stimer->config.auto_enable) stimer->config.enable = 1; } if (stimer->config.enable) stimer_mark_pending(stimer, false); return 0; } static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) { *pconfig = stimer->config.as_uint64; return 0; } static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) { *pcount = stimer->count; return 0; } static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, struct hv_message *src_msg, bool no_retry) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int msg_off = offsetof(struct hv_message_page, sint_message[sint]); gfn_t msg_page_gfn; struct hv_message_header hv_hdr; int r; if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) return -ENOENT; msg_page_gfn = synic->msg_page >> PAGE_SHIFT; /* * Strictly following the spec-mandated ordering would assume setting * .msg_pending before checking .message_type. However, this function * is only called in vcpu context so the entire update is atomic from * guest POV and thus the exact order here doesn't matter. */ r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type, msg_off + offsetof(struct hv_message, header.message_type), sizeof(hv_hdr.message_type)); if (r < 0) return r; if (hv_hdr.message_type != HVMSG_NONE) { if (no_retry) return 0; hv_hdr.message_flags.msg_pending = 1; r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_flags, msg_off + offsetof(struct hv_message, header.message_flags), sizeof(hv_hdr.message_flags)); if (r < 0) return r; return -EAGAIN; } r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off, sizeof(src_msg->header) + src_msg->header.payload_size); if (r < 0) return r; r = synic_set_irq(synic, sint); if (r < 0) return r; if (r == 0) return -EFAULT; return 0; } static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; /* * To avoid piling up periodic ticks, don't retry message * delivery for them (within "lazy" lost ticks policy). */ bool no_retry = stimer->config.periodic; payload->expiration_time = stimer->exp_time; payload->delivery_time = get_time_ref_counter(vcpu->kvm); return synic_deliver_msg(to_hv_synic(vcpu), stimer->config.sintx, msg, no_retry); } static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = stimer->config.apic_vector }; if (lapic_in_kernel(vcpu)) return !kvm_apic_set_irq(vcpu, &irq, NULL); return 0; } static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) { int r, direct = stimer->config.direct_mode; stimer->msg_pending = true; if (!direct) r = stimer_send_msg(stimer); else r = stimer_notify_direct(stimer); trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, direct, r); if (!r) { stimer->msg_pending = false; if (!(stimer->config.periodic)) stimer->config.enable = 0; } } void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; u64 time_now, exp_time; int i; if (!hv_vcpu) return; for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { stimer = &hv_vcpu->stimer[i]; if (stimer->config.enable) { exp_time = stimer->exp_time; if (exp_time) { time_now = get_time_ref_counter(vcpu->kvm); if (time_now >= exp_time) stimer_expiration(stimer); } if ((stimer->config.enable) && stimer->count) { if (!stimer->msg_pending) stimer_start(stimer); } else stimer_cleanup(stimer); } } } void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; if (!hv_vcpu) return; for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_cleanup(&hv_vcpu->stimer[i]); kfree(hv_vcpu); vcpu->arch.hyperv = NULL; } bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (!hv_vcpu) return false; if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_assist_page_enabled); int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu)) return -EFAULT; return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_get_assist_page); static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; memset(&msg->header, 0, sizeof(msg->header)); msg->header.message_type = HVMSG_TIMER_EXPIRED; msg->header.payload_size = sizeof(*payload); payload->timer_index = stimer->index; payload->expiration_time = 0; payload->delivery_time = 0; } static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer_prepare_msg(stimer); } int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; if (hv_vcpu) return 0; hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT); if (!hv_vcpu) return -ENOMEM; vcpu->arch.hyperv = hv_vcpu; hv_vcpu->vcpu = vcpu; synic_init(&hv_vcpu->synic); bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_init(&hv_vcpu->stimer[i], i); hv_vcpu->vp_index = vcpu->vcpu_idx; for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) { INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries); spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock); } return 0; } int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { struct kvm_vcpu_hv_synic *synic; int r; r = kvm_hv_vcpu_init(vcpu); if (r) return r; synic = to_hv_synic(vcpu); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; synic->control = HV_SYNIC_CONTROL_ENABLE; return 0; } static bool kvm_hv_msr_partition_wide(u32 msr) { bool r = false; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: case HV_X64_MSR_REFERENCE_TSC: case HV_X64_MSR_TIME_REF_COUNT: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_RESET: case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: case HV_X64_MSR_TSC_INVARIANT_CONTROL: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; break; } return r; } static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata) { struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) return -EINVAL; *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; return 0; } static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata) { struct kvm_hv *hv = to_kvm_hv(kvm); *pdata = hv->hv_crash_ctl; return 0; } static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data) { struct kvm_hv *hv = to_kvm_hv(kvm); hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; return 0; } static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data) { struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) return -EINVAL; hv->hv_crash_param[array_index_nospec(index, size)] = data; return 0; } /* * The kvmclock and Hyper-V TSC page use similar formulas, and converting * between them is possible: * * kvmclock formula: * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32) * + system_time * * Hyper-V formula: * nsec/100 = ticks * scale / 2^64 + offset * * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula. * By dividing the kvmclock formula by 100 and equating what's left we get: * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100 * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 * * Now expand the kvmclock formula and divide by 100: * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32) * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) * + system_time * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * + system_time / 100 * * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64: * nsec/100 = ticks * scale / 2^64 * - tsc_timestamp * scale / 2^64 * + system_time / 100 * * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out: * offset = system_time / 100 - tsc_timestamp * scale / 2^64 * * These two equivalencies are implemented in this function. */ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, struct ms_hyperv_tsc_page *tsc_ref) { u64 max_mul; if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT)) return false; /* * check if scale would overflow, if so we use the time ref counter * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64 * tsc_to_system_mul / 100 >= 2^(32-tsc_shift) * tsc_to_system_mul >= 100 * 2^(32-tsc_shift) */ max_mul = 100ull << (32 - hv_clock->tsc_shift); if (hv_clock->tsc_to_system_mul >= max_mul) return false; /* * Otherwise compute the scale and offset according to the formulas * derived above. */ tsc_ref->tsc_scale = mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift), hv_clock->tsc_to_system_mul, 100); tsc_ref->tsc_offset = hv_clock->system_time; do_div(tsc_ref->tsc_offset, 100); tsc_ref->tsc_offset -= mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64); return true; } /* * Don't touch TSC page values if the guest has opted for TSC emulation after * migration. KVM doesn't fully support reenlightenment notifications and TSC * access emulation and Hyper-V is known to expect the values in TSC page to * stay constant before TSC access emulation is disabled from guest side * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC * frequency and guest visible TSC value across migration (and prevent it when * TSC scaling is unsupported). */ static inline bool tsc_page_update_unsafe(struct kvm_hv *hv) { return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) && hv->hv_tsc_emulation_control; } void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) { struct kvm_hv *hv = to_kvm_hv(kvm); u32 tsc_seq; u64 gfn; BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); guard(mutex)(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_SET || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) return; if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) return; gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* * Because the TSC parameters only vary when there is a * change in the master clock, do not bother with caching. */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) goto out_err; if (tsc_seq && tsc_page_update_unsafe(hv)) { if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; return; } /* * While we're computing and writing the parameters, force the * guest to use the time reference count MSR. */ hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) goto out_err; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) goto out_err; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) goto out_err; /* * Now switch to the TSC page mechanism by writing the sequence. */ tsc_seq++; if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0) tsc_seq = 1; /* Write the struct entirely before the non-zero sequence. */ smp_wmb(); hv->tsc_ref.tsc_sequence = tsc_seq; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; return; out_err: hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; } void kvm_hv_request_tsc_page_update(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); mutex_lock(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET && !tsc_page_update_unsafe(hv)) hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; mutex_unlock(&hv->hv_lock); } static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) { if (!hv_vcpu->enforce_cpuid) return true; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_HYPERCALL_AVAILABLE; case HV_X64_MSR_VP_RUNTIME: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_VP_RUNTIME_AVAILABLE; case HV_X64_MSR_TIME_REF_COUNT: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_TIME_REF_COUNT_AVAILABLE; case HV_X64_MSR_VP_INDEX: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_VP_INDEX_AVAILABLE; case HV_X64_MSR_RESET: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_RESET_AVAILABLE; case HV_X64_MSR_REFERENCE_TSC: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_REFERENCE_TSC_AVAILABLE; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: case HV_X64_MSR_SIEFP: case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_SYNIC_AVAILABLE; case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: case HV_X64_MSR_STIMER0_COUNT: case HV_X64_MSR_STIMER1_COUNT: case HV_X64_MSR_STIMER2_COUNT: case HV_X64_MSR_STIMER3_COUNT: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_SYNTIMER_AVAILABLE; case HV_X64_MSR_EOI: case HV_X64_MSR_ICR: case HV_X64_MSR_TPR: case HV_X64_MSR_VP_ASSIST_PAGE: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_APIC_ACCESS_AVAILABLE; case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_FREQUENCY_MSRS; case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_REENLIGHTENMENT; case HV_X64_MSR_TSC_INVARIANT_CONTROL: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: return hv_vcpu->cpuid_cache.features_edx & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return hv_vcpu->cpuid_cache.features_edx & HV_FEATURE_DEBUG_MSRS_AVAILABLE; default: break; } return false; } #define KVM_HV_WIN2016_GUEST_ID 0x1040a00003839 #define KVM_HV_WIN2016_GUEST_ID_MASK (~GENMASK_ULL(23, 16)) /* mask out the service version */ /* * Hyper-V enabled Windows Server 2016 SMP VMs fail to boot in !XSAVES && XSAVEC * configuration. * Such configuration can result from, for example, AMD Erratum 1386 workaround. * * Print a notice so users aren't left wondering what's suddenly gone wrong. */ static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); /* Check again under the hv_lock. */ if (hv->xsaves_xsavec_checked) return; if ((hv->hv_guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) != KVM_HV_WIN2016_GUEST_ID) return; hv->xsaves_xsavec_checked = true; /* UP configurations aren't affected */ if (atomic_read(&kvm->online_vcpus) < 2) return; if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) || !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVEC)) return; pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. " "If it fails to boot try disabling XSAVEC in the VM config.\n"); } void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (!vcpu->arch.hyperv_enabled || hv->xsaves_xsavec_checked) return; mutex_lock(&hv->hv_lock); __kvm_hv_xsaves_xsavec_maybe_warn(vcpu); mutex_unlock(&hv->hv_lock); } static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) return 1; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: hv->hv_guest_os_id = data; /* setting guest os id to zero disables hypercall page */ if (!hv->hv_guest_os_id) hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; break; case HV_X64_MSR_HYPERCALL: { u8 instructions[9]; int i = 0; u64 addr; /* if guest os id is not set hypercall should remain disabled */ if (!hv->hv_guest_os_id) break; if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { hv->hv_hypercall = data; break; } /* * If Xen and Hyper-V hypercalls are both enabled, disambiguate * the same way Xen itself does, by setting the bit 31 of EAX * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just * going to be clobbered on 64-bit. */ if (kvm_xen_hypercall_enabled(kvm)) { /* orl $0x80000000, %eax */ instructions[i++] = 0x0d; instructions[i++] = 0x00; instructions[i++] = 0x00; instructions[i++] = 0x00; instructions[i++] = 0x80; } /* vmcall/vmmcall */ kvm_x86_call(patch_hypercall)(vcpu, instructions + i); i += 3; /* ret */ ((unsigned char *)instructions)[i++] = 0xc3; addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; if (kvm_vcpu_write_guest(vcpu, addr, instructions, i)) return 1; hv->hv_hypercall = data; break; } case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) { if (!host) hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED; else hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); } else { hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET; } break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_set_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, data); case HV_X64_MSR_CRASH_CTL: if (host) return kvm_hv_msr_set_crash_ctl(kvm, data); if (data & HV_CRASH_CTL_CRASH_NOTIFY) { vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", hv->hv_crash_param[0], hv->hv_crash_param[1], hv->hv_crash_param[2], hv->hv_crash_param[3], hv->hv_crash_param[4]); /* Send notification about crash to user space */ kvm_make_request(KVM_REQ_HV_CRASH, vcpu); } break; case HV_X64_MSR_RESET: if (data == 1) { vcpu_debug(vcpu, "hyper-v reset requested\n"); kvm_make_request(KVM_REQ_HV_RESET, vcpu); } break; case HV_X64_MSR_REENLIGHTENMENT_CONTROL: hv->hv_reenlightenment_control = data; break; case HV_X64_MSR_TSC_EMULATION_CONTROL: hv->hv_tsc_emulation_control = data; break; case HV_X64_MSR_TSC_EMULATION_STATUS: if (data && !host) return 1; hv->hv_tsc_emulation_status = data; break; case HV_X64_MSR_TIME_REF_COUNT: /* read-only, but still ignore it if host-initiated */ if (!host) return 1; break; case HV_X64_MSR_TSC_INVARIANT_CONTROL: /* Only bit 0 is supported */ if (data & ~HV_EXPOSE_INVARIANT_TSC) return 1; /* The feature can't be disabled from the guest */ if (!host && hv->hv_invtsc_control && !data) return 1; hv->hv_invtsc_control = data; break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); default: kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; } /* Calculate cpu time spent by current task in 100ns units */ static u64 current_task_runtime_100ns(void) { u64 utime, stime; task_cputime_adjusted(current, &utime, &stime); return div_u64(utime + stime, 100); } static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) return 1; switch (msr) { case HV_X64_MSR_VP_INDEX: { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); u32 new_vp_index = (u32)data; if (!host || new_vp_index >= KVM_MAX_VCPUS) return 1; if (new_vp_index == hv_vcpu->vp_index) return 0; /* * The VP index is initialized to vcpu_index by * kvm_hv_vcpu_postcreate so they initially match. Now the * VP index is changing, adjust num_mismatched_vp_indexes if * it now matches or no longer matches vcpu_idx. */ if (hv_vcpu->vp_index == vcpu->vcpu_idx) atomic_inc(&hv->num_mismatched_vp_indexes); else if (new_vp_index == vcpu->vcpu_idx) atomic_dec(&hv->num_mismatched_vp_indexes); hv_vcpu->vp_index = new_vp_index; break; } case HV_X64_MSR_VP_ASSIST_PAGE: { u64 gfn; unsigned long addr; if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv_vcpu->hv_vapic = data; if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) return 1; break; } gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT; addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return 1; /* * Clear apic_assist portion of struct hv_vp_assist_page * only, there can be valuable data in the rest which needs * to be preserved e.g. on migration. */ if (__put_user(0, (u32 __user *)addr)) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, sizeof(struct hv_vp_assist_page))) return 1; break; } case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); case HV_X64_MSR_ICR: return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); case HV_X64_MSR_VP_RUNTIME: if (!host) return 1; hv_vcpu->runtime_offset = data - current_task_runtime_100ns(); break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: case HV_X64_MSR_SIEFP: case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: return synic_set_msr(to_hv_synic(vcpu), msr, data, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; return stimer_set_config(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_STIMER0_COUNT: case HV_X64_MSR_STIMER1_COUNT: case HV_X64_MSR_STIMER2_COUNT: case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; return stimer_set_count(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: /* read-only, but still ignore it if host-initiated */ if (!host) return 1; break; default: kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; } static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) return 1; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: data = hv->hv_guest_os_id; break; case HV_X64_MSR_HYPERCALL: data = hv->hv_hypercall; break; case HV_X64_MSR_TIME_REF_COUNT: data = get_time_ref_counter(kvm); break; case HV_X64_MSR_REFERENCE_TSC: data = hv->hv_tsc_page; break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_get_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, pdata); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_get_crash_ctl(kvm, pdata); case HV_X64_MSR_RESET: data = 0; break; case HV_X64_MSR_REENLIGHTENMENT_CONTROL: data = hv->hv_reenlightenment_control; break; case HV_X64_MSR_TSC_EMULATION_CONTROL: data = hv->hv_tsc_emulation_control; break; case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; case HV_X64_MSR_TSC_INVARIANT_CONTROL: data = hv->hv_invtsc_control; break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); default: kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; return 0; } static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) return 1; switch (msr) { case HV_X64_MSR_VP_INDEX: data = hv_vcpu->vp_index; break; case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); case HV_X64_MSR_ICR: return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); case HV_X64_MSR_VP_ASSIST_PAGE: data = hv_vcpu->hv_vapic; break; case HV_X64_MSR_VP_RUNTIME: data = current_task_runtime_100ns() + hv_vcpu->runtime_offset; break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: case HV_X64_MSR_SIEFP: case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; return stimer_get_config(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_STIMER0_COUNT: case HV_X64_MSR_STIMER1_COUNT: case HV_X64_MSR_STIMER2_COUNT: case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; return stimer_get_count(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_TSC_FREQUENCY: data = (u64)vcpu->arch.virtual_tsc_khz * 1000; break; case HV_X64_MSR_APIC_FREQUENCY: data = div64_u64(1000000000ULL, vcpu->kvm->arch.apic_bus_cycle_ns); break; default: kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; return 0; } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (!host && !vcpu->arch.hyperv_enabled) return 1; if (kvm_hv_vcpu_init(vcpu)) return 1; if (kvm_hv_msr_partition_wide(msr)) { int r; mutex_lock(&hv->hv_lock); r = kvm_hv_set_msr_pw(vcpu, msr, data, host); mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_set_msr(vcpu, msr, data, host); } int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (!host && !vcpu->arch.hyperv_enabled) return 1; if (kvm_hv_vcpu_init(vcpu)) return 1; if (kvm_hv_msr_partition_wide(msr)) { int r; mutex_lock(&hv->hv_lock); r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_get_msr(vcpu, msr, pdata, host); } static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, unsigned long *vcpu_mask) { struct kvm_hv *hv = to_kvm_hv(kvm); bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes); u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; struct kvm_vcpu *vcpu; int bank, sbank = 0; unsigned long i; u64 *bitmap; BUILD_BUG_ON(sizeof(vp_bitmap) > sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS)); /* * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else * fill a temporary buffer and manually test each vCPU's VP index. */ if (likely(!has_mismatch)) bitmap = (u64 *)vcpu_mask; else bitmap = vp_bitmap; /* * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask * having a '1' for each bank that exists in sparse_banks. Sets must * be in ascending order, i.e. bank0..bankN. */ memset(bitmap, 0, sizeof(vp_bitmap)); for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, KVM_HV_MAX_SPARSE_VCPU_SET_BITS) bitmap[bank] = sparse_banks[sbank++]; if (likely(!has_mismatch)) return; bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap)) __set_bit(i, vcpu_mask); } } static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[]) { int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK; unsigned long sbank; if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask)) return false; /* * The index into the sparse bank is the number of preceding bits in * the valid mask. Optimize for VMs with <64 vCPUs by skipping the * fancy math if there can't possibly be preceding bits. */ if (valid_bit_nr) sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0)); else sbank = 0; return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK, (unsigned long *)&sparse_banks[sbank]); } struct kvm_hv_hcall { /* Hypercall input data */ u64 param; u64 ingpa; u64 outgpa; u16 code; u16 var_cnt; u16 rep_cnt; u16 rep_idx; bool fast; bool rep; sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; /* * Current read offset when KVM reads hypercall input data gradually, * either offset in bytes from 'ingpa' for regular hypercalls or the * number of already consumed 'XMM halves' for 'fast' hypercalls. */ union { gpa_t data_offset; int consumed_xmm_halves; }; }; static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc, u16 orig_cnt, u16 cnt_cap, u64 *data) { /* * Preserve the original count when ignoring entries via a "cap", KVM * still needs to validate the guest input (though the non-XMM path * punts on the checks). */ u16 cnt = min(orig_cnt, cnt_cap); int i, j; if (hc->fast) { /* * Each XMM holds two sparse banks, but do not count halves that * have already been consumed for hypercall parameters. */ if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves) return HV_STATUS_INVALID_HYPERCALL_INPUT; for (i = 0; i < cnt; i++) { j = i + hc->consumed_xmm_halves; if (j % 2) data[i] = sse128_hi(hc->xmm[j / 2]); else data[i] = sse128_lo(hc->xmm[j / 2]); } return 0; } return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data, cnt * sizeof(*data)); } static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 *sparse_banks) { if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS) return -EINVAL; /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */ return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS, sparse_banks); } static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[]) { return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries); } static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo, u64 *entries, int count) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY; if (!hv_vcpu) return; spin_lock(&tlb_flush_fifo->write_lock); /* * All entries should fit on the fifo leaving one free for 'flush all' * entry in case another request comes in. In case there's not enough * space, just put 'flush all' entry there. */ if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) { WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count); goto out_unlock; } /* * Note: full fifo always contains 'flush all' entry, no need to check the * return value. */ kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1); out_unlock: spin_unlock(&tlb_flush_fifo->write_lock); } int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE]; int i, j, count; gva_t gva; if (!tdp_enabled || !hv_vcpu) return -EINVAL; tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu)); count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE); for (i = 0; i < count; i++) { if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY) goto out_flush_all; if (is_noncanonical_invlpg_address(entries[i], vcpu)) continue; /* * Lower 12 bits of 'address' encode the number of additional * pages to flush. */ gva = entries[i] & PAGE_MASK; for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++) kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); ++vcpu->stat.tlb_flush; } return 0; out_flush_all: kfifo_reset_out(&tlb_flush_fifo->entries); /* Fall back to full flush. */ return -ENOSPC; } static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); unsigned long *vcpu_mask = hv_vcpu->vcpu_mask; u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; /* * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE' * entries on the TLB flush fifo. The last entry, however, needs to be * always left free for 'flush all' entry which gets placed when * there is not enough space to put all the requested entries. */ u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1]; u64 *tlb_flush_entries; u64 valid_bank_mask; struct kvm_vcpu *v; unsigned long i; bool all_cpus; /* * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS * sparse banks. Fail the build if KVM's max allowed number of * vCPUs (>4096) exceeds this limit. */ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS); /* * 'Slow' hypercall's first parameter is the address in guest's memory * where hypercall parameters are placed. This is either a GPA or a * nested GPA when KVM is handling the call from L2 ('direct' TLB * flush). Translate the address here so the memory can be uniformly * read with kvm_read_guest(). */ if (!hc->fast && is_guest_mode(vcpu)) { hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL); if (unlikely(hc->ingpa == INVALID_GPA)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST || hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) { if (hc->fast) { flush.address_space = hc->ingpa; flush.flags = hc->outgpa; flush.processor_mask = sse128_lo(hc->xmm[0]); hc->consumed_xmm_halves = 1; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush, sizeof(flush)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; hc->data_offset = sizeof(flush); } trace_kvm_hv_flush_tlb(flush.processor_mask, flush.address_space, flush.flags, is_guest_mode(vcpu)); valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; /* * Work around possible WS2012 bug: it sends hypercalls * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear, * while also expecting us to flush something and crashing if * we don't. Let's treat processor_mask == 0 same as * HV_FLUSH_ALL_PROCESSORS. */ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || flush.processor_mask == 0; } else { if (hc->fast) { flush_ex.address_space = hc->ingpa; flush_ex.flags = hc->outgpa; memcpy(&flush_ex.hv_vp_set, &hc->xmm[0], sizeof(hc->xmm[0])); hc->consumed_xmm_halves = 2; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, sizeof(flush_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; hc->data_offset = sizeof(flush_ex); } trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, flush_ex.hv_vp_set.format, flush_ex.address_space, flush_ex.flags, is_guest_mode(vcpu)); valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask; all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (!all_cpus) { if (!hc->var_cnt) goto ret_success; if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } /* * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs' * case (HV_GENERIC_SET_ALL). Always adjust data_offset and * consumed_xmm_halves to make sure TLB flush entries are read * from the correct offset. */ if (hc->fast) hc->consumed_xmm_halves += hc->var_cnt; else hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]); } if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE || hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX || hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) { tlb_flush_entries = NULL; } else { if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries)) return HV_STATUS_INVALID_HYPERCALL_INPUT; tlb_flush_entries = __tlb_flush_entries; } /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ if (all_cpus && !is_guest_mode(vcpu)) { kvm_for_each_vcpu(i, v, kvm) { tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); hv_tlb_flush_enqueue(v, tlb_flush_fifo, tlb_flush_entries, hc->rep_cnt); } kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); } else if (!is_guest_mode(vcpu)) { sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) { v = kvm_get_vcpu(kvm, i); if (!v) continue; tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); hv_tlb_flush_enqueue(v, tlb_flush_fifo, tlb_flush_entries, hc->rep_cnt); } kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } else { struct kvm_vcpu_hv *hv_v; bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, v, kvm) { hv_v = to_hv_vcpu(v); /* * The following check races with nested vCPUs entering/exiting * and/or migrating between L1's vCPUs, however the only case when * KVM *must* flush the TLB is when the target L2 vCPU keeps * running on the same L1 vCPU from the moment of the request until * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other * cases, e.g. when the target L2 vCPU migrates to a different L1 * vCPU or when the corresponding L1 vCPU temporary switches to a * different L2 vCPU while the request is being processed. */ if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id) continue; if (!all_cpus && !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask, sparse_banks)) continue; __set_bit(i, vcpu_mask); tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true); hv_tlb_flush_enqueue(v, tlb_flush_fifo, tlb_flush_entries, hc->rep_cnt); } kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } ret_success: /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */ return (u64)HV_STATUS_SUCCESS | ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, u64 *sparse_banks, u64 valid_bank_mask) { struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = vector }; struct kvm_vcpu *vcpu; unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { if (sparse_banks && !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu), valid_bank_mask, sparse_banks)) continue; /* We fail only when APIC is disabled */ kvm_apic_set_irq(vcpu, &irq, NULL); } } static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; u64 valid_bank_mask; u32 vector; bool all_cpus; if (!lapic_in_kernel(vcpu)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, sizeof(send_ipi)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = send_ipi.cpu_mask; vector = send_ipi.vector; } else { /* 'reserved' part of hv_send_ipi should be 0 */ if (unlikely(hc->ingpa >> 32 != 0)) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = hc->outgpa; vector = (u32)hc->ingpa; } all_cpus = false; valid_bank_mask = BIT_ULL(0); trace_kvm_hv_send_ipi(vector, sparse_banks[0]); } else { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex, sizeof(send_ipi_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; } else { send_ipi_ex.vector = (u32)hc->ingpa; send_ipi_ex.vp_set.format = hc->outgpa; send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]); } trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector, send_ipi_ex.vp_set.format, send_ipi_ex.vp_set.valid_bank_mask); vector = send_ipi_ex.vector; valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) goto check_and_send_ipi; if (!hc->var_cnt) goto ret_success; if (!hc->fast) hc->data_offset = offsetof(struct hv_send_ipi_ex, vp_set.bank_contents); else hc->consumed_xmm_halves = 1; if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0); else kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask); ret_success: return HV_STATUS_SUCCESS; } void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_cpuid_entry2 *entry; vcpu->arch.hyperv_enabled = hyperv_enabled; if (!hv_vcpu) { /* * KVM should have already allocated kvm_vcpu_hv if Hyper-V is * enabled in CPUID. */ WARN_ON_ONCE(vcpu->arch.hyperv_enabled); return; } memset(&hv_vcpu->cpuid_cache, 0, sizeof(hv_vcpu->cpuid_cache)); if (!vcpu->arch.hyperv_enabled) return; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); if (entry) { hv_vcpu->cpuid_cache.features_eax = entry->eax; hv_vcpu->cpuid_cache.features_ebx = entry->ebx; hv_vcpu->cpuid_cache.features_edx = entry->edx; } entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); if (entry) { hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; } entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); if (entry) hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_NESTED_FEATURES); if (entry) { hv_vcpu->cpuid_cache.nested_eax = entry->eax; hv_vcpu->cpuid_cache.nested_ebx = entry->ebx; } } int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce) { struct kvm_vcpu_hv *hv_vcpu; int ret = 0; if (!to_hv_vcpu(vcpu)) { if (enforce) { ret = kvm_hv_vcpu_init(vcpu); if (ret) return ret; } else { return 0; } } hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->enforce_cpuid = enforce; return ret; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { bool longmode; longmode = is_64_bit_hypercall(vcpu); if (longmode) kvm_rax_write(vcpu, result); else { kvm_rdx_write(vcpu, result >> 32); kvm_rax_write(vcpu, result & 0xffffffff); } } static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { u32 tlb_lock_count = 0; int ret; if (hv_result_success(result) && is_guest_mode(vcpu) && kvm_hv_is_tlb_flush_hcall(vcpu) && kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa, &tlb_lock_count, sizeof(tlb_lock_count))) result = HV_STATUS_INVALID_HYPERCALL_INPUT; trace_kvm_hv_hypercall_done(result); kvm_hv_hypercall_set_result(vcpu, result); ++vcpu->stat.hypercalls; ret = kvm_skip_emulated_instruction(vcpu); if (tlb_lock_count) kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu); return ret; } static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) { return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); } static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); struct eventfd_ctx *eventfd; if (unlikely(!hc->fast)) { int ret; gpa_t gpa = hc->ingpa; if ((gpa & (__alignof__(hc->ingpa) - 1)) || offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE) return HV_STATUS_INVALID_ALIGNMENT; ret = kvm_vcpu_read_guest(vcpu, gpa, &hc->ingpa, sizeof(hc->ingpa)); if (ret < 0) return HV_STATUS_INVALID_ALIGNMENT; } /* * Per spec, bits 32-47 contain the extra "flag number". However, we * have no use for it, and in all known usecases it is zero, so just * report lookup failure if it isn't. */ if (hc->ingpa & 0xffff00000000ULL) return HV_STATUS_INVALID_PORT_ID; /* remaining bits are reserved-zero */ if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK) return HV_STATUS_INVALID_HYPERCALL_INPUT; /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */ rcu_read_lock(); eventfd = idr_find(&hv->conn_to_evt, hc->ingpa); rcu_read_unlock(); if (!eventfd) return HV_STATUS_INVALID_PORT_ID; eventfd_signal(eventfd); return HV_STATUS_SUCCESS; } static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) { switch (hc->code) { case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: case HVCALL_SEND_IPI_EX: return true; } return false; } static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc) { int reg; kvm_fpu_get(); for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) _kvm_read_sse_reg(reg, &hc->xmm[reg]); kvm_fpu_put(); } static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) { if (!hv_vcpu->enforce_cpuid) return true; switch (code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: return hv_vcpu->cpuid_cache.enlightenments_ebx && hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX; case HVCALL_POST_MESSAGE: return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES; case HVCALL_SIGNAL_EVENT: return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS; case HVCALL_POST_DEBUG_DATA: case HVCALL_RETRIEVE_DEBUG_DATA: case HVCALL_RESET_DEBUG_SESSION: /* * Return 'true' when SynDBG is disabled so the resulting code * will be HV_STATUS_INVALID_HYPERCALL_CODE. */ return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) || hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: if (!(hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) return false; fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; case HVCALL_SEND_IPI_EX: if (!(hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) return false; fallthrough; case HVCALL_SEND_IPI: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_CLUSTER_IPI_RECOMMENDED; case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: return hv_vcpu->cpuid_cache.features_ebx & HV_ENABLE_EXTENDED_HYPERCALLS; default: break; } return true; } int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_hv_hcall hc; u64 ret = HV_STATUS_SUCCESS; /* * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } #ifdef CONFIG_X86_64 if (is_64_bit_hypercall(vcpu)) { hc.param = kvm_rcx_read(vcpu); hc.ingpa = kvm_rdx_read(vcpu); hc.outgpa = kvm_r8_read(vcpu); } else #endif { hc.param = ((u64)kvm_rdx_read(vcpu) << 32) | (kvm_rax_read(vcpu) & 0xffffffff); hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | (kvm_rcx_read(vcpu) & 0xffffffff); hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | (kvm_rsi_read(vcpu) & 0xffffffff); } hc.code = hc.param & 0xffff; hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET; hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT); hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt, hc.rep_idx, hc.ingpa, hc.outgpa); if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { ret = HV_STATUS_ACCESS_DENIED; goto hypercall_complete; } if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; goto hypercall_complete; } if (hc.fast && is_xmm_fast_hypercall(&hc)) { if (unlikely(hv_vcpu->enforce_cpuid && !(hv_vcpu->cpuid_cache.features_edx & HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } kvm_hv_hypercall_read_xmm(&hc); } switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hvcall_signal_event(vcpu, &hc); if (ret != HV_STATUS_INVALID_PORT_ID) break; fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } goto hypercall_userspace_exit; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: if (unlikely(!hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc); break; case HVCALL_SEND_IPI: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_SEND_IPI_EX: if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_send_ipi(vcpu, &hc); break; case HVCALL_POST_DEBUG_DATA: case HVCALL_RETRIEVE_DEBUG_DATA: if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } fallthrough; case HVCALL_RESET_DEBUG_SESSION: { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu)) { ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) { ret = HV_STATUS_OPERATION_DENIED; break; } goto hypercall_userspace_exit; } case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } goto hypercall_userspace_exit; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); hypercall_userspace_exit: vcpu->run->exit_reason = KVM_EXIT_HYPERV; vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; vcpu->run->hyperv.u.hcall.input = hc.param; vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; } void kvm_hv_init_vm(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); mutex_init(&hv->hv_lock); idr_init(&hv->conn_to_evt); } void kvm_hv_destroy_vm(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int i; idr_for_each_entry(&hv->conn_to_evt, eventfd, i) eventfd_ctx_put(eventfd); idr_destroy(&hv->conn_to_evt); } static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int ret; eventfd = eventfd_ctx_fdget(fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); mutex_lock(&hv->hv_lock); ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, GFP_KERNEL_ACCOUNT); mutex_unlock(&hv->hv_lock); if (ret >= 0) return 0; if (ret == -ENOSPC) ret = -EEXIST; eventfd_ctx_put(eventfd); return ret; } static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; mutex_lock(&hv->hv_lock); eventfd = idr_remove(&hv->conn_to_evt, conn_id); mutex_unlock(&hv->hv_lock); if (!eventfd) return -ENOENT; synchronize_srcu(&kvm->srcu); eventfd_ctx_put(eventfd); return 0; } int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) { if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) || (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK)) return -EINVAL; if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN) return kvm_hv_eventfd_deassign(kvm, args->conn_id); return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); } int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { uint16_t evmcs_ver = 0; struct kvm_cpuid_entry2 cpuid_entries[] = { { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS }, { .function = HYPERV_CPUID_INTERFACE }, { .function = HYPERV_CPUID_VERSION }, { .function = HYPERV_CPUID_FEATURES }, { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS }, { .function = HYPERV_CPUID_SYNDBG_INTERFACE }, { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }, { .function = HYPERV_CPUID_NESTED_FEATURES }, }; int i, nent = ARRAY_SIZE(cpuid_entries); if (kvm_x86_ops.nested_ops->get_evmcs_version) evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); if (cpuid->nent < nent) return -E2BIG; if (cpuid->nent > nent) cpuid->nent = nent; for (i = 0; i < nent; i++) { struct kvm_cpuid_entry2 *ent = &cpuid_entries[i]; u32 signature[3]; switch (ent->function) { case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; break; case HYPERV_CPUID_INTERFACE: ent->eax = HYPERV_CPUID_SIGNATURE_EAX; break; case HYPERV_CPUID_VERSION: /* * We implement some Hyper-V 2016 functions so let's use * this version. */ ent->eax = 0x00003839; ent->ebx = 0x000A0000; break; case HYPERV_CPUID_FEATURES: ent->eax |= HV_MSR_VP_RUNTIME_AVAILABLE; ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; ent->eax |= HV_MSR_SYNIC_AVAILABLE; ent->eax |= HV_MSR_SYNTIMER_AVAILABLE; ent->eax |= HV_MSR_APIC_ACCESS_AVAILABLE; ent->eax |= HV_MSR_HYPERCALL_AVAILABLE; ent->eax |= HV_MSR_VP_INDEX_AVAILABLE; ent->eax |= HV_MSR_RESET_AVAILABLE; ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; ent->eax |= HV_ACCESS_FREQUENCY_MSRS; ent->eax |= HV_ACCESS_REENLIGHTENMENT; ent->eax |= HV_ACCESS_TSC_INVARIANT; ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; ent->ebx |= HV_DEBUGGING; ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH; /* * Direct Synthetic timers only make sense with in-kernel * LAPIC */ if (!vcpu || lapic_in_kernel(vcpu)) ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; break; case HYPERV_CPUID_ENLIGHTMENT_INFO: ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; if (!vcpu || lapic_in_kernel(vcpu)) ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; if (!cpu_smt_possible()) ent->eax |= HV_X64_NO_NONARCH_CORESHARING; ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. */ ent->ebx = 0x00000FFF; break; case HYPERV_CPUID_IMPLEMENT_LIMITS: /* Maximum number of virtual processors */ ent->eax = KVM_MAX_VCPUS; /* * Maximum number of logical processors, matches * HyperV 2016. */ ent->ebx = 64; break; case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; ent->eax |= HV_X64_NESTED_DIRECT_FLUSH; ent->eax |= HV_X64_NESTED_MSR_BITMAP; ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL; break; case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); ent->eax = 0; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; break; case HYPERV_CPUID_SYNDBG_INTERFACE: memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); ent->eax = signature[0]; break; case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; break; default: break; } } if (copy_to_user(entries, cpuid_entries, nent * sizeof(struct kvm_cpuid_entry2))) return -EFAULT; return 0; }
12 12 1 12 1 12 12 5 9 2 4 11 11 11 10 9 10 4 6 4 7 7 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/u64_stats_sync.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables_offload.h> struct nft_counter { u64_stats_t bytes; u64_stats_t packets; }; struct nft_counter_tot { s64 bytes; s64 packets; }; struct nft_counter_percpu_priv { struct nft_counter __percpu *counter; }; static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync); static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); nft_sync = this_cpu_ptr(&nft_counter_sync); u64_stats_update_begin(nft_sync); u64_stats_add(&this_cpu->bytes, pkt->skb->len); u64_stats_inc(&this_cpu->packets); u64_stats_update_end(nft_sync); local_bh_enable(); } static inline void nft_counter_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_counter_percpu_priv *priv = nft_obj_data(obj); nft_counter_do_eval(priv, regs, pkt); } static int nft_counter_do_init(const struct nlattr * const tb[], struct nft_counter_percpu_priv *priv) { struct nft_counter __percpu *cpu_stats; struct nft_counter *this_cpu; cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT); if (cpu_stats == NULL) return -ENOMEM; this_cpu = raw_cpu_ptr(cpu_stats); if (tb[NFTA_COUNTER_PACKETS]) { u64_stats_set(&this_cpu->packets, be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]))); } if (tb[NFTA_COUNTER_BYTES]) { u64_stats_set(&this_cpu->bytes, be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]))); } priv->counter = cpu_stats; return 0; } static int nft_counter_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_counter_percpu_priv *priv = nft_obj_data(obj); return nft_counter_do_init(tb, priv); } static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv) { free_percpu(priv->counter); } static void nft_counter_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_counter_percpu_priv *priv = nft_obj_data(obj); nft_counter_do_destroy(priv); } static void nft_counter_reset(struct nft_counter_percpu_priv *priv, struct nft_counter_tot *total) { struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); nft_sync = this_cpu_ptr(&nft_counter_sync); u64_stats_update_begin(nft_sync); u64_stats_add(&this_cpu->packets, -total->packets); u64_stats_add(&this_cpu->bytes, -total->bytes); u64_stats_update_end(nft_sync); local_bh_enable(); } static void nft_counter_fetch(struct nft_counter_percpu_priv *priv, struct nft_counter_tot *total) { struct nft_counter *this_cpu; u64 bytes, packets; unsigned int seq; int cpu; memset(total, 0, sizeof(*total)); for_each_possible_cpu(cpu) { struct u64_stats_sync *nft_sync = per_cpu_ptr(&nft_counter_sync, cpu); this_cpu = per_cpu_ptr(priv->counter, cpu); do { seq = u64_stats_fetch_begin(nft_sync); bytes = u64_stats_read(&this_cpu->bytes); packets = u64_stats_read(&this_cpu->packets); } while (u64_stats_fetch_retry(nft_sync, seq)); total->bytes += bytes; total->packets += packets; } } static int nft_counter_do_dump(struct sk_buff *skb, struct nft_counter_percpu_priv *priv, bool reset) { struct nft_counter_tot total; nft_counter_fetch(priv, &total); if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), NFTA_COUNTER_PAD) || nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets), NFTA_COUNTER_PAD)) goto nla_put_failure; if (reset) nft_counter_reset(priv, &total); return 0; nla_put_failure: return -1; } static int nft_counter_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { struct nft_counter_percpu_priv *priv = nft_obj_data(obj); return nft_counter_do_dump(skb, priv, reset); } static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; struct nft_object_type nft_counter_obj_type; static const struct nft_object_ops nft_counter_obj_ops = { .type = &nft_counter_obj_type, .size = sizeof(struct nft_counter_percpu_priv), .eval = nft_counter_obj_eval, .init = nft_counter_obj_init, .destroy = nft_counter_obj_destroy, .dump = nft_counter_obj_dump, }; struct nft_object_type nft_counter_obj_type __read_mostly = { .type = NFT_OBJECT_COUNTER, .ops = &nft_counter_obj_ops, .maxattr = NFTA_COUNTER_MAX, .policy = nft_counter_policy, .owner = THIS_MODULE, }; void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); nft_counter_do_eval(priv, regs, pkt); } static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); return nft_counter_do_dump(skb, priv, reset); } static int nft_counter_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); return nft_counter_do_init(tb, priv); } static void nft_counter_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); nft_counter_do_destroy(priv); } static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_counter_percpu_priv *priv = nft_expr_priv(src); struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); struct nft_counter __percpu *cpu_stats; struct nft_counter *this_cpu; struct nft_counter_tot total; nft_counter_fetch(priv, &total); cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp); if (cpu_stats == NULL) return -ENOMEM; this_cpu = raw_cpu_ptr(cpu_stats); u64_stats_set(&this_cpu->packets, total.packets); u64_stats_set(&this_cpu->bytes, total.bytes); priv_clone->counter = cpu_stats; return 0; } static int nft_counter_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { /* No specific offload action is needed, but report success. */ return 0; } static void nft_counter_offload_stats(struct nft_expr *expr, const struct flow_stats *stats) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); nft_sync = this_cpu_ptr(&nft_counter_sync); u64_stats_update_begin(nft_sync); u64_stats_add(&this_cpu->packets, stats->pkts); u64_stats_add(&this_cpu->bytes, stats->bytes); u64_stats_update_end(nft_sync); local_bh_enable(); } void nft_counter_init_seqcount(void) { int cpu; for_each_possible_cpu(cpu) u64_stats_init(per_cpu_ptr(&nft_counter_sync, cpu)); } struct nft_expr_type nft_counter_type; static const struct nft_expr_ops nft_counter_ops = { .type = &nft_counter_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)), .eval = nft_counter_eval, .init = nft_counter_init, .destroy = nft_counter_destroy, .destroy_clone = nft_counter_destroy, .dump = nft_counter_dump, .clone = nft_counter_clone, .reduce = NFT_REDUCE_READONLY, .offload = nft_counter_offload, .offload_stats = nft_counter_offload_stats, }; struct nft_expr_type nft_counter_type __read_mostly = { .name = "counter", .ops = &nft_counter_ops, .policy = nft_counter_policy, .maxattr = NFTA_COUNTER_MAX, .flags = NFT_EXPR_STATEFUL, .owner = THIS_MODULE, };
6 2 4 4 6 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FIRMWARE_LOADER_H #define __FIRMWARE_LOADER_H #include <linux/bitops.h> #include <linux/firmware.h> #include <linux/types.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/completion.h> /** * enum fw_opt - options to control firmware loading behaviour * * @FW_OPT_UEVENT: Enables the fallback mechanism to send a kobject uevent * when the firmware is not found. Userspace is in charge to load the * firmware using the sysfs loading facility. * @FW_OPT_NOWAIT: Used to describe the firmware request is asynchronous. * @FW_OPT_USERHELPER: Enable the fallback mechanism, in case the direct * filesystem lookup fails at finding the firmware. For details refer to * firmware_fallback_sysfs(). * @FW_OPT_NO_WARN: Quiet, avoid printing warning messages. * @FW_OPT_NOCACHE: Disables firmware caching. Firmware caching is used to * cache the firmware upon suspend, so that upon resume races against the * firmware file lookup on storage is avoided. Used for calls where the * file may be too big, or where the driver takes charge of its own * firmware caching mechanism. * @FW_OPT_NOFALLBACK_SYSFS: Disable the sysfs fallback mechanism. Takes * precedence over &FW_OPT_UEVENT and &FW_OPT_USERHELPER. * @FW_OPT_FALLBACK_PLATFORM: Enable fallback to device fw copy embedded in * the platform's main firmware. If both this fallback and the sysfs * fallback are enabled, then this fallback will be tried first. * @FW_OPT_PARTIAL: Allow partial read of firmware instead of needing to read * entire file. */ enum fw_opt { FW_OPT_UEVENT = BIT(0), FW_OPT_NOWAIT = BIT(1), FW_OPT_USERHELPER = BIT(2), FW_OPT_NO_WARN = BIT(3), FW_OPT_NOCACHE = BIT(4), FW_OPT_NOFALLBACK_SYSFS = BIT(5), FW_OPT_FALLBACK_PLATFORM = BIT(6), FW_OPT_PARTIAL = BIT(7), }; enum fw_status { FW_STATUS_UNKNOWN, FW_STATUS_LOADING, FW_STATUS_DONE, FW_STATUS_ABORTED, }; /* * Concurrent request_firmware() for the same firmware need to be * serialized. struct fw_state is simple state machine which hold the * state of the firmware loading. */ struct fw_state { struct completion completion; enum fw_status status; }; struct fw_priv { struct kref ref; struct list_head list; struct firmware_cache *fwc; struct fw_state fw_st; void *data; size_t size; size_t allocated_size; size_t offset; u32 opt_flags; #ifdef CONFIG_FW_LOADER_PAGED_BUF bool is_paged_buf; struct page **pages; int nr_pages; int page_array_size; #endif #ifdef CONFIG_FW_LOADER_USER_HELPER bool need_uevent; struct list_head pending_list; #endif const char *fw_name; }; extern struct mutex fw_lock; extern struct firmware_cache fw_cache; extern bool fw_load_abort_all; static inline bool __fw_state_check(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; return fw_st->status == status; } static inline int __fw_state_wait_common(struct fw_priv *fw_priv, long timeout) { struct fw_state *fw_st = &fw_priv->fw_st; long ret; ret = wait_for_completion_killable_timeout(&fw_st->completion, timeout); if (ret != 0 && fw_st->status == FW_STATUS_ABORTED) return -ENOENT; if (!ret) return -ETIMEDOUT; return ret < 0 ? ret : 0; } static inline void __fw_state_set(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; WRITE_ONCE(fw_st->status, status); if (status == FW_STATUS_DONE || status == FW_STATUS_ABORTED) { #ifdef CONFIG_FW_LOADER_USER_HELPER /* * Doing this here ensures that the fw_priv is deleted from * the pending list in all abort/done paths. */ list_del_init(&fw_priv->pending_list); #endif complete_all(&fw_st->completion); } } static inline void fw_state_aborted(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_ABORTED); } static inline bool fw_state_is_aborted(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_ABORTED); } static inline void fw_state_start(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_LOADING); } static inline void fw_state_done(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_DONE); } static inline bool fw_state_is_done(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_DONE); } static inline bool fw_state_is_loading(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_LOADING); } int alloc_lookup_fw_priv(const char *fw_name, struct firmware_cache *fwc, struct fw_priv **fw_priv, void *dbuf, size_t size, size_t offset, u32 opt_flags); int assign_fw(struct firmware *fw, struct device *device); void free_fw_priv(struct fw_priv *fw_priv); void fw_state_init(struct fw_priv *fw_priv); #ifdef CONFIG_FW_LOADER bool firmware_is_builtin(const struct firmware *fw); bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size); #else /* module case */ static inline bool firmware_is_builtin(const struct firmware *fw) { return false; } static inline bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size) { return false; } #endif #ifdef CONFIG_FW_LOADER_PAGED_BUF void fw_free_paged_buf(struct fw_priv *fw_priv); int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed); int fw_map_paged_buf(struct fw_priv *fw_priv); bool fw_is_paged_buf(struct fw_priv *fw_priv); #else static inline void fw_free_paged_buf(struct fw_priv *fw_priv) {} static inline int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { return -ENXIO; } static inline int fw_map_paged_buf(struct fw_priv *fw_priv) { return -ENXIO; } static inline bool fw_is_paged_buf(struct fw_priv *fw_priv) { return false; } #endif #endif /* __FIRMWARE_LOADER_H */
35 34 35 35 34 35 34 112 113 4 4 4 2 2 2 2 288 3 3 1 3 3 3 3 3 1 3 290 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 1 1 1 1 1 1 1 1 1 2 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 // SPDX-License-Identifier: GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. */ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. */ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type. */ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_read_value); /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc(sizeof(*qos), GFP_KERNEL); if (!qos) return -ENOMEM; n = kcalloc(3, sizeof(*n), GFP_KERNEL); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void __dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). */ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. */ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if (req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. */ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. * * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. */ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. */ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. */ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. */ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. */ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. */ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. * @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). */ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance);
102 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 // SPDX-License-Identifier: GPL-2.0+ /* * IMA support for appraising module-style appended signatures. * * Copyright (C) 2019 IBM Corporation * * Author: * Thiago Jung Bauermann <bauerman@linux.ibm.com> */ #include <linux/types.h> #include <linux/module_signature.h> #include <keys/asymmetric-type.h> #include <crypto/pkcs7.h> #include "ima.h" struct modsig { struct pkcs7_message *pkcs7_msg; enum hash_algo hash_algo; /* This digest will go in the 'd-modsig' field of the IMA template. */ const u8 *digest; u32 digest_size; /* * This is what will go to the measurement list if the template requires * storing the signature. */ int raw_pkcs7_len; u8 raw_pkcs7[] __counted_by(raw_pkcs7_len); }; /* * ima_read_modsig - Read modsig from buf. * * Return: 0 on success, error code otherwise. */ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig) { const size_t marker_len = strlen(MODULE_SIG_STRING); const struct module_signature *sig; struct modsig *hdr; size_t sig_len; const void *p; int rc; if (buf_len <= marker_len + sizeof(*sig)) return -ENOENT; p = buf + buf_len - marker_len; if (memcmp(p, MODULE_SIG_STRING, marker_len)) return -ENOENT; buf_len -= marker_len; sig = (const struct module_signature *)(p - sizeof(*sig)); rc = mod_check_sig(sig, buf_len, func_tokens[func]); if (rc) return rc; sig_len = be32_to_cpu(sig->sig_len); buf_len -= sig_len + sizeof(*sig); /* Allocate sig_len additional bytes to hold the raw PKCS#7 data. */ hdr = kzalloc(struct_size(hdr, raw_pkcs7, sig_len), GFP_KERNEL); if (!hdr) return -ENOMEM; hdr->raw_pkcs7_len = sig_len; hdr->pkcs7_msg = pkcs7_parse_message(buf + buf_len, sig_len); if (IS_ERR(hdr->pkcs7_msg)) { rc = PTR_ERR(hdr->pkcs7_msg); kfree(hdr); return rc; } memcpy(hdr->raw_pkcs7, buf + buf_len, sig_len); /* We don't know the hash algorithm yet. */ hdr->hash_algo = HASH_ALGO__LAST; *modsig = hdr; return 0; } /** * ima_collect_modsig - Calculate the file hash without the appended signature. * @modsig: parsed module signature * @buf: data to verify the signature on * @size: data size * * Since the modsig is part of the file contents, the hash used in its signature * isn't the same one ordinarily calculated by IMA. Therefore PKCS7 code * calculates a separate one for signature verification. */ void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size) { int rc; /* * Provide the file contents (minus the appended sig) so that the PKCS7 * code can calculate the file hash. */ size -= modsig->raw_pkcs7_len + strlen(MODULE_SIG_STRING) + sizeof(struct module_signature); rc = pkcs7_supply_detached_data(modsig->pkcs7_msg, buf, size); if (rc) return; /* Ask the PKCS7 code to calculate the file hash. */ rc = pkcs7_get_digest(modsig->pkcs7_msg, &modsig->digest, &modsig->digest_size, &modsig->hash_algo); } int ima_modsig_verify(struct key *keyring, const struct modsig *modsig) { return verify_pkcs7_message_sig(NULL, 0, modsig->pkcs7_msg, keyring, VERIFYING_MODULE_SIGNATURE, NULL, NULL); } int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size) { *algo = modsig->hash_algo; *digest = modsig->digest; *digest_size = modsig->digest_size; return 0; } int ima_get_raw_modsig(const struct modsig *modsig, const void **data, u32 *data_len) { *data = &modsig->raw_pkcs7; *data_len = modsig->raw_pkcs7_len; return 0; } void ima_free_modsig(struct modsig *modsig) { if (!modsig) return; pkcs7_free_message(modsig->pkcs7_msg); kfree(modsig); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. */ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H #include <linux/cleanup.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/mutex.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/atomic.h> #include <linux/xarray.h> #include <linux/refcount.h> #include <uapi/linux/btrfs_tree.h> #include "locking.h" #include "fs.h" #include "accessors.h" #include "extent-io-tree.h" struct extent_buffer; struct btrfs_block_rsv; struct btrfs_trans_handle; struct btrfs_block_group; /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, READA_BACK, READA_FORWARD, /* * Similar to READA_FORWARD but unlike it: * * 1) It will trigger readahead even for leaves that are not close to * each other on disk; * 2) It also triggers readahead for nodes; * 3) During a search, even when a node or leaf is already in memory, it * will still trigger readahead for other nodes and leaves that follow * it. * * This is meant to be used only when we know we are iterating over the * entire tree or a very large part of it. */ READA_FORWARD_ALWAYS, }; /* * btrfs_paths remember the path taken from the root down to the leaf. * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point * to any other levels that are present. * * The slots array records the index of the item or block pointer * used while walking the tree. */ struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; /* if there is real range locking, this locks field will change */ u8 locks[BTRFS_MAX_LEVEL]; u8 reada; u8 lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ unsigned int search_for_split:1; /* Keep some upper locks as we walk down. */ unsigned int keep_locks:1; unsigned int skip_locking:1; unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; /* * Indicate that new item (btrfs_search_slot) is extending already * existing item and ins_len contains only the data size and not item * header (ie. sizeof(struct btrfs_item) is not included). */ unsigned int search_for_extension:1; /* Stop search if any locks need to be taken (for read) */ unsigned int nowait:1; }; #define BTRFS_PATH_AUTO_FREE(path_name) \ struct btrfs_path *path_name __free(btrfs_free_path) = NULL /* * The state of btrfs root */ enum { /* * btrfs_record_root_in_trans is a multi-step process, and it can race * with the balancing code. But the race is very small, and only the * first time the root is added to each transaction. So IN_TRANS_SETUP * is used to tell us when more checks are required */ BTRFS_ROOT_IN_TRANS_SETUP, /* * Set if tree blocks of this root can be shared by other roots. * Only subvolume trees and their reloc trees have this bit set. * Conflicts with TRACK_DIRTY bit. * * This affects two things: * * - How balance works * For shareable roots, we need to use reloc tree and do path * replacement for balance, and need various pre/post hooks for * snapshot creation to handle them. * * While for non-shareable trees, we just simply do a tree search * with COW. * * - How dirty roots are tracked * For shareable roots, btrfs_record_root_in_trans() is needed to * track them, while non-subvolume roots have TRACK_DIRTY bit, they * don't need to set this manually. */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, BTRFS_ROOT_IN_RADIX, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, BTRFS_ROOT_MULTI_LOG_TASKS, BTRFS_ROOT_DIRTY, BTRFS_ROOT_DELETING, /* * Reloc tree is orphan, only kept here for qgroup delayed subtree scan * * Set for the subvolume tree owning the reloc tree. */ BTRFS_ROOT_DEAD_RELOC_TREE, /* Mark dead root stored on device whose cleanup needs to be resumed */ BTRFS_ROOT_DEAD_TREE, /* The root has a log tree. Used for subvolume roots and the tree root. */ BTRFS_ROOT_HAS_LOG_TREE, /* Qgroup flushing is in progress */ BTRFS_ROOT_QGROUP_FLUSHING, /* We started the orphan cleanup for this root. */ BTRFS_ROOT_ORPHAN_CLEANUP, /* This root has a drop operation that was started previously. */ BTRFS_ROOT_UNFINISHED_DROP, /* This reloc root needs to have its buffers lockdep class reset. */ BTRFS_ROOT_RESET_LOCKDEP_CLASS, }; /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. */ struct btrfs_qgroup_swapped_blocks { spinlock_t lock; /* RM_EMPTY_ROOT() of above blocks[] */ bool swapped; struct rb_root blocks[BTRFS_MAX_LEVEL]; }; /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ struct btrfs_root { struct rb_node rb_node; struct extent_buffer *node; struct extent_buffer *commit_root; struct btrfs_root *log_root; struct btrfs_root *reloc_root; unsigned long state; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; struct extent_io_tree dirty_log_pages; struct mutex objectid_mutex; spinlock_t accounting_lock; struct btrfs_block_rsv *block_rsv; struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; struct list_head log_ctxs[2]; /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_writers; atomic_t log_commit[2]; /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; /* * Protected by the 'log_mutex' lock but can be read without holding * that lock to avoid unnecessary lock contention, in which case it * should be read using btrfs_get_root_log_transid() except if it's a * log tree in which case it can be directly accessed. Updates to this * field should always use btrfs_set_root_log_transid(), except for log * trees where the field can be updated directly. */ int log_transid; /* No matter the commit succeeds or not*/ int log_transid_committed; /* * Just be updated when the commit succeeds. Use * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit() * to access this field. */ int last_log_commit; pid_t log_start_pid; u64 last_trans; u64 free_objectid; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; /* The dirty list is only used by non-shareable roots */ struct list_head dirty_list; struct list_head root_list; /* Xarray that keeps track of in-memory inodes. */ struct xarray inodes; /* Xarray that keeps track of delayed nodes of every inode. */ struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later */ dev_t anon_dev; spinlock_t root_item_lock; refcount_t refs; struct mutex delalloc_mutex; spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. It is possible for * this list to be empty even when there is still dirty data=ordered * extents waiting to finish IO. */ struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; struct mutex ordered_extent_mutex; /* * this is used by the balancing code to wait for all the pending * ordered extents */ spinlock_t ordered_extent_lock; /* * all of the data=ordered extents pending writeback * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ struct list_head ordered_extents; struct list_head ordered_root; u64 nr_ordered_extents; /* * Not empty if this subvolume root has gone through tree block swap * (relocation) * * Will be used by reloc_control::dirty_subvol_roots. */ struct list_head reloc_dirty_list; /* * Number of currently running SEND ioctls to prevent * manipulation with the read-only status via SUBVOL_SETFLAGS */ int send_in_progress; /* * Number of currently running deduplication operations that have a * destination inode belonging to this root. Protected by the lock * root_item_lock. */ int dedupe_in_progress; /* For exclusion of snapshot creation and nocow writes */ struct btrfs_drew_lock snapshot_lock; atomic_t snapshot_force_cow; /* For qgroup metadata reserved space */ spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; wait_queue_head_t qgroup_flush_wait; /* Number of active swapfiles */ atomic_t nr_swapfiles; /* Record pairs of swapped blocks for qgroup */ struct btrfs_qgroup_swapped_blocks swapped_blocks; /* Used only by log trees, when logging csum items */ struct extent_io_tree log_csum_range; /* Used in simple quotas, track root during relocation. */ u64 relocation_src_root; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif }; static inline bool btrfs_root_readonly(const struct btrfs_root *root) { /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } static inline bool btrfs_root_dead(const struct btrfs_root *root) { /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } static inline u64 btrfs_root_id(const struct btrfs_root *root) { return root->root_key.objectid; } static inline int btrfs_get_root_log_transid(const struct btrfs_root *root) { return READ_ONCE(root->log_transid); } static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid) { WRITE_ONCE(root->log_transid, log_transid); } static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root) { return READ_ONCE(root->last_log_commit); } static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id) { WRITE_ONCE(root->last_log_commit, commit_id); } static inline u64 btrfs_get_root_last_trans(const struct btrfs_root *root) { return READ_ONCE(root->last_trans); } static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transid) { WRITE_ONCE(root->last_trans, transid); } /* * Return the generation this root started with. * * Every normal root that is created with root->root_key.offset set to it's * originating generation. If it is a snapshot it is the generation when the * snapshot was created. * * However for TREE_RELOC roots root_key.offset is the objectid of the owning * tree root. Thankfully we copy the root item of the owning tree root, which * has it's last_snapshot set to what we would have root_key.offset set to, so * return that if this is a TREE_RELOC root. */ static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) { if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) return btrfs_root_last_snapshot(&root->root_item); return root->root_key.offset; } /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. */ struct btrfs_replace_extent_info { u64 disk_offset; u64 disk_len; u64 data_offset; u64 data_len; u64 file_offset; /* Pointer to a file extent item of type regular or prealloc. */ char *extent_buf; /* * Set to true when attempting to replace a file range with a new extent * described by this structure, set to false when attempting to clone an * existing extent into a file range. */ bool is_new_extent; /* Indicate if we should update the inode's mtime and ctime. */ bool update_times; /* Meaningful only if is_new_extent is true. */ int qgroup_reserved; /* * Meaningful only if is_new_extent is true. * Used to track how many extent items we have already inserted in a * subvolume tree that refer to the extent described by this structure, * so that we know when to create a new delayed ref or update an existing * one. */ int insertions; }; /* Arguments for btrfs_drop_extents() */ struct btrfs_drop_extents_args { /* Input parameters */ /* * If NULL, btrfs_drop_extents() will allocate and free its own path. * If 'replace_extent' is true, this must not be NULL. Also the path * is always released except if 'replace_extent' is true and * btrfs_drop_extents() sets 'extent_inserted' to true, in which case * the path is kept locked. */ struct btrfs_path *path; /* Start offset of the range to drop extents from */ u64 start; /* End (exclusive, last byte + 1) of the range to drop extents from */ u64 end; /* If true drop all the extent maps in the range */ bool drop_cache; /* * If true it means we want to insert a new extent after dropping all * the extents in the range. If this is true, the 'extent_item_size' * parameter must be set as well and the 'extent_inserted' field will * be set to true by btrfs_drop_extents() if it could insert the new * extent. * Note: when this is set to true the path must not be NULL. */ bool replace_extent; /* * Used if 'replace_extent' is true. Size of the file extent item to * insert after dropping all existing extents in the range */ u32 extent_item_size; /* Output parameters */ /* * Set to the minimum between the input parameter 'end' and the end * (exclusive, last byte + 1) of the last dropped extent. This is always * set even if btrfs_drop_extents() returns an error. */ u64 drop_end; /* * The number of allocated bytes found in the range. This can be smaller * than the range's length when there are holes in the range. */ u64 bytes_found; /* * Only set if 'replace_extent' is true. Set to true if we were able * to insert a replacement extent after dropping all extents in the * range, otherwise set to false by btrfs_drop_extents(). * Also, if btrfs_drop_extents() has set this to true it means it * returned with the path locked, otherwise if it has set this to * false it has returned with the path released. */ bool extent_inserted; }; struct btrfs_file_private { void *filldir_buf; u64 last_index; struct extent_state *llseek_cached_state; /* Task that allocated this structure. */ struct task_struct *owner_task; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { return info->nodesize - sizeof(struct btrfs_header); } static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); } static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); } static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); #ifdef __LITTLE_ENDIAN /* * Compare two keys, on little-endian the disk order is same as CPU order and * we can avoid the conversion. */ static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key, const struct btrfs_key *k2) { const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; return btrfs_comp_cpu_keys(k1, k2); } #else /* Compare two keys in a memcmp fashion. */ static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk, const struct btrfs_key *k2) { struct btrfs_key k1; btrfs_disk_key_to_cpu(&k1, disk); return btrfs_comp_cpu_keys(&k1, k2); } #endif int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, const struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans); struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int slot); int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, enum btrfs_lock_nesting nest); int btrfs_force_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, u64 search_start, u64 empty_size, enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans, const struct btrfs_root *root, const struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, const struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_trans_handle *trans, const struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *new_key, unsigned long split_offset); int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *new_key); int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, u64 time_seq); int btrfs_search_slot_for_read(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int find_higher, int return_any); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T)) int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); static inline int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path) { return btrfs_del_items(trans, root, path, path->slots[0], 1); } /* * Describes a batch of items to insert in a btree. This is used by * btrfs_insert_empty_items(). */ struct btrfs_item_batch { /* * Pointer to an array containing the keys of the items to insert (in * sorted order). */ const struct btrfs_key *keys; /* Pointer to an array containing the data size for each item to insert. */ const u32 *data_sizes; /* * The sum of data sizes for all items. The caller can compute this while * setting up the data_sizes array, so it ends up being more efficient * than having btrfs_insert_empty_items() or setup_item_for_insert() * doing it, as it would avoid an extra loop over a potentially large * array, and in the case of setup_item_for_insert(), we would be doing * it while holding a write lock on a leaf and often on upper level nodes * too, unnecessarily increasing the size of a critical section. */ u32 total_data_size; /* Size of the keys and data_sizes arrays (number of items in the batch). */ int nr; }; void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_item_batch *batch); static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size) { struct btrfs_item_batch batch; batch.keys = key; batch.data_sizes = &data_size; batch.total_data_size = data_size; batch.nr = 1; return btrfs_insert_empty_items(trans, root, path, &batch); } int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path); int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path); /* * Search in @root for a given @key, and store the slot found in @found_key. * * @root: The root node of the tree. * @key: The key we are looking for. * @found_key: Will hold the found item. * @path: Holds the current slot/leaf. * @iter_ret: Contains the value returned from btrfs_search_slot or * btrfs_get_next_valid_item, whichever was executed last. * * The @iter_ret is an output variable that will contain the return value of * btrfs_search_slot, if it encountered an error, or the value returned from * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid * slot was found, 1 if there were no more leaves, and <0 if there was an error. * * It's recommended to use a separate variable for iter_ret and then use it to * set the function return value so there's no confusion of the 0/1/errno * values stemming from btrfs_search_slot. */ #define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \ for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \ (iter_ret) >= 0 && \ (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \ (path)->slots[0]++ \ ) int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); /* * Search the tree again to find a leaf with greater keys. * * Returns 0 if it found something or 1 if there are no greater leaves. * Returns < 0 on error. */ static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) { return btrfs_next_old_leaf(root, path, 0); } static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(const struct extent_buffer *leaf); static inline bool btrfs_is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID) return true; if ((s64)rootid < (s64)BTRFS_FIRST_FREE_OBJECTID) return false; if (btrfs_qgroup_level(rootid) != 0) return false; return true; } static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } #endif
1 1 1 1 1 1 1 1 1 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 /* * Copyright (c) 2006,2007 The Regents of the University of Michigan. * All rights reserved. * * Andy Adamson <andros@citi.umich.edu> * Fred Isaman <iisaman@umich.edu> * * permission is granted to use, copy, create derivative works and * redistribute this software and such derivative works for any purpose, * so long as the name of the university of michigan is not used in * any advertising or publicity pertaining to the use or distribution * of this software without specific, written prior authorization. if * the above copyright notice or any other identification of the * university of michigan is included in any copy of any portion of * this software, then the disclaimer below must also be included. * * this software is provided as is, without representation from the * university of michigan as to its fitness for any purpose, and without * warranty by the university of michigan of any kind, either express * or implied, including without limitation the implied warranties of * merchantability and fitness for a particular purpose. the regents * of the university of michigan shall not be liable for any damages, * including special, indirect, incidental, or consequential damages, * with respect to any claim arising out or in connection with the use * of the software, even if it has been or is hereafter advised of the * possibility of such damages. */ #include <linux/module.h> #include <linux/blkdev.h> #include "blocklayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD static void nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b) { int i; *p++ = cpu_to_be32(1); *p++ = cpu_to_be32(b->type); *p++ = cpu_to_be32(b->simple.nr_sigs); for (i = 0; i < b->simple.nr_sigs; i++) { p = xdr_encode_hyper(p, b->simple.sigs[i].offset); p = xdr_encode_opaque(p, b->simple.sigs[i].sig, b->simple.sigs[i].sig_len); } } dev_t bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, gfp_t gfp_mask) { struct net *net = server->nfs_client->cl_net; struct nfs_net *nn = net_generic(net, nfs_net_id); struct bl_dev_msg *reply = &nn->bl_mount_reply; struct bl_pipe_msg bl_pipe_msg; struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; struct bl_msg_hdr *bl_msg; DECLARE_WAITQUEUE(wq, current); dev_t dev = 0; int rc; dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); mutex_lock(&nn->bl_mutex); bl_pipe_msg.bl_wq = &nn->bl_wq; b->simple.len += 4; /* single volume */ if (b->simple.len > PAGE_SIZE) goto out_unlock; memset(msg, 0, sizeof(*msg)); msg->len = sizeof(*bl_msg) + b->simple.len; msg->data = kzalloc(msg->len, gfp_mask); if (!msg->data) goto out_unlock; bl_msg = msg->data; bl_msg->type = BL_DEVICE_MOUNT; bl_msg->totallen = b->simple.len; nfs4_encode_simple(msg->data + sizeof(*bl_msg), b); dprintk("%s CALLING USERSPACE DAEMON\n", __func__); add_wait_queue(&nn->bl_wq, &wq); rc = rpc_queue_upcall(nn->bl_device_pipe, msg); if (rc < 0) { remove_wait_queue(&nn->bl_wq, &wq); goto out_free_data; } set_current_state(TASK_UNINTERRUPTIBLE); schedule(); remove_wait_queue(&nn->bl_wq, &wq); if (reply->status != BL_DEVICE_REQUEST_PROC) { printk(KERN_WARNING "%s failed to decode device: %d\n", __func__, reply->status); goto out_free_data; } dev = MKDEV(reply->major, reply->minor); out_free_data: kfree(msg->data); out_unlock: mutex_unlock(&nn->bl_mutex); return dev; } static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct nfs_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfs_net_id); if (mlen != sizeof (struct bl_dev_msg)) return -EINVAL; if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) return -EFAULT; wake_up(&nn->bl_wq); return mlen; } static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); if (msg->errno >= 0) return; wake_up(bl_pipe_msg->bl_wq); } static const struct rpc_pipe_ops bl_upcall_ops = { .upcall = rpc_pipe_generic_upcall, .downcall = bl_pipe_downcall, .destroy_msg = bl_pipe_destroy_msg, }; static int nfs4blocklayout_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { struct dentry *dir; int err; dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); if (dir == NULL) return -ENOENT; err = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); dput(dir); return err; } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct super_block *sb = ptr; struct net *net = sb->s_fs_info; struct nfs_net *nn = net_generic(net, nfs_net_id); int ret = 0; if (!try_module_get(THIS_MODULE)) return 0; if (nn->bl_device_pipe == NULL) { module_put(THIS_MODULE); return 0; } switch (event) { case RPC_PIPEFS_MOUNT: ret = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); break; case RPC_PIPEFS_UMOUNT: rpc_unlink(nn->bl_device_pipe); break; default: ret = -ENOTSUPP; break; } module_put(THIS_MODULE); return ret; } static struct notifier_block nfs4blocklayout_block = { .notifier_call = rpc_pipefs_event, }; static int nfs4blocklayout_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *pipefs_sb; int ret; pipefs_sb = rpc_get_sb_net(net); if (!pipefs_sb) return 0; ret = nfs4blocklayout_register_sb(pipefs_sb, pipe); rpc_put_sb_net(net); return ret; } static void nfs4blocklayout_unregister_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *pipefs_sb; pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { rpc_unlink(pipe); rpc_put_sb_net(net); } } static int nfs4blocklayout_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); int err; mutex_init(&nn->bl_mutex); init_waitqueue_head(&nn->bl_wq); nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); if (IS_ERR(nn->bl_device_pipe)) return PTR_ERR(nn->bl_device_pipe); err = nfs4blocklayout_register_net(net, nn->bl_device_pipe); if (unlikely(err)) rpc_destroy_pipe_data(nn->bl_device_pipe); return err; } static void nfs4blocklayout_net_exit(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); rpc_destroy_pipe_data(nn->bl_device_pipe); nn->bl_device_pipe = NULL; } static struct pernet_operations nfs4blocklayout_net_ops = { .init = nfs4blocklayout_net_init, .exit = nfs4blocklayout_net_exit, }; int __init bl_init_pipefs(void) { int ret; ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); if (ret) goto out; ret = register_pernet_subsys(&nfs4blocklayout_net_ops); if (ret) goto out_unregister_notifier; return 0; out_unregister_notifier: rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); out: return ret; } void bl_cleanup_pipefs(void) { rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); unregister_pernet_subsys(&nfs4blocklayout_net_ops); }
2 2 2 2 2 2 2 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 // SPDX-License-Identifier: GPL-2.0-or-later /* * Syncookies implementation for the Linux kernel * * Copyright (C) 1997 Andi Kleen * Based on ideas by D.J.Bernstein and Eric Schenk. */ #include <linux/tcp.h> #include <linux/siphash.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/secure_seq.h> #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/route.h> static siphash_aligned_key_t syncookie_secret[2]; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) /* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK * stores TCP options: * * MSB LSB * | 31 ... 6 | 5 | 4 | 3 2 1 0 | * | Timestamp | ECN | SACK | WScale | * * When we receive a valid cookie-ACK, we look at the echoed tsval (if * any) to figure out which TCP options we should use for the rebuilt * connection. * * A WScale setting of '0xf' (which is an invalid scaling value) * means that original syn did not include the TCP window scaling option. */ #define TS_OPT_WSCALE_MASK 0xf #define TS_OPT_SACK BIT(4) #define TS_OPT_ECN BIT(5) /* There is no TS_OPT_TIMESTAMP: * if ACK contains timestamp option, we already know it was * requested/supported by the syn/synack exchange. */ #define TSBITS 6 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); return siphash_4u32((__force u32)saddr, (__force u32)daddr, (__force u32)sport << 16 | (__force u32)dport, count, &syncookie_secret[c]); } /* * when syncookies are in effect and tcp timestamps are enabled we encode * tcp options in the lower bits of the timestamp value that will be * sent in the syn-ack. * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ u64 cookie_init_timestamp(struct request_sock *req, u64 now) { const struct inet_request_sock *ireq = inet_rsk(req); u64 ts, ts_now = tcp_ns_to_ts(false, now); u32 options = 0; options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; if (ireq->sack_ok) options |= TS_OPT_SACK; if (ireq->ecn_ok) options |= TS_OPT_ECN; ts = (ts_now >> TSBITS) << TSBITS; ts |= options; if (ts > ts_now) ts -= (1UL << TSBITS); if (tcp_rsk(req)->req_usec_ts) return ts * NSEC_PER_USEC; return ts * NSEC_PER_MSEC; } static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, __u32 sseq, __u32 data) { /* * Compute the secure sequence number. * The output should be: * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). * Where sseq is their sequence number and count increases every * minute by 1. * As an extra hack, we add a small "data" value that encodes the * MSS into the second hash value. */ u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)); } /* * This retrieves the small "data" value from the syncookie. * If the syncookie is bad, the data returned will be out of * range. This must be checked by the caller. * * The count value used to generate the cookie must be less than * MAX_SYNCOOKIE_AGE minutes in the past. * The return value (__u32)-1 if this test fails. */ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, __be16 sport, __be16 dport, __u32 sseq) { u32 diff, count = tcp_cookie_time(); /* Strip away the layers from the cookie */ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) & COOKIEMASK; /* Leaving the data behind */ } /* * MSS Values are chosen based on the 2011 paper * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson. * Values .. * .. lower than 536 are rare (< 0.2%) * .. between 537 and 1299 account for less than < 1.5% of observed values * .. in the 1300-1349 range account for about 15 to 20% of observed mss values * .. exceeding 1460 are very rare (< 0.04%) * * 1460 is the single most frequently announced mss value (30 to 46% depending * on monitor location). Table must be sorted. */ static __u16 const msstab[] = { 536, 1300, 1440, /* 1440, 1452: PPPoE */ 1460, }; /* * Generate a syncookie. mssp points to the mss, which is returned * rounded down to the value encoded in the cookie. */ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp) { int mssind; const __u16 mss = *mssp; for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), mssind); } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); return __cookie_v4_init_sequence(iph, th, mssp); } /* * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. */ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th) { __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; __u32 mssind; mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } EXPORT_SYMBOL_GPL(__cookie_v4_check); struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; bool own_req; child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, NULL, &own_req); if (child) { refcount_set(&req->rsk_refcnt, 1); sock_rps_save_rxhash(child, skb); if (rsk_drop_req(req)) { reqsk_put(req); return child; } if (inet_csk_reqsk_queue_add(sk, req, child)) return child; bh_unlock_sock(child); sock_put(child); } __reqsk_free(req); return NULL; } EXPORT_IPV6_MOD(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored * additional tcp options in the timestamp. * This extracts these options from the timestamp echo. * * return false if we decode a tcp option that is disabled * on the host. */ bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *tcp_opt) { /* echoed timestamp, lowest bits contain options */ u32 options = tcp_opt->rcv_tsecr; if (!tcp_opt->saw_tstamp) { tcp_clear_options(tcp_opt); return true; } if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return false; tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return false; if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) return true; /* no window scaling */ tcp_opt->wscale_ok = 1; tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0; } EXPORT_IPV6_MOD(cookie_timestamp_decode); static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); const struct tcphdr *th = tcp_hdr(skb); req->num_retrans = 0; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; ireq->ir_iif = inet_request_bound_dev_if(sk, skb); ireq->ir_mark = inet_request_mark(sk, skb); if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; treq->snt_synack = 0; treq->snt_tsval_first = 0; treq->tfo_listener = false; treq->txhash = net_tx_rndhash(); treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = ntohl(th->ack_seq) - 1; treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; treq->req_usec_ts = false; #if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); if (treq->is_mptcp) return mptcp_subflow_init_cookie_req(req, sk, skb); #endif return 0; } #if IS_ENABLED(CONFIG_BPF) struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb) { struct request_sock *req = inet_reqsk(skb->sk); skb->sk = NULL; skb->destructor = NULL; if (cookie_tcp_reqsk_init(sk, skb, req)) { reqsk_free(req); req = NULL; } return req; } EXPORT_IPV6_MOD_GPL(cookie_bpf_check); #endif struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb, struct tcp_options_received *tcp_opt, int mss, u32 tsoff) { struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; if (sk_is_mptcp(sk)) req = mptcp_subflow_reqsk_alloc(ops, sk, false); else req = inet_reqsk_alloc(ops, sk, false); if (!req) return NULL; if (cookie_tcp_reqsk_init(sk, skb, req)) { reqsk_free(req); return NULL; } ireq = inet_rsk(req); treq = tcp_rsk(req); req->mss = mss; req->ts_recent = tcp_opt->saw_tstamp ? tcp_opt->rcv_tsval : 0; ireq->snd_wscale = tcp_opt->snd_wscale; ireq->tstamp_ok = tcp_opt->saw_tstamp; ireq->sack_ok = tcp_opt->sack_ok; ireq->wscale_ok = tcp_opt->wscale_ok; ireq->ecn_ok = !!(tcp_opt->rcv_tsecr & TS_OPT_ECN); treq->ts_off = tsoff; return req; } EXPORT_IPV6_MOD_GPL(cookie_tcp_reqsk_alloc); static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; u32 tsoff = 0; int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; mss = __cookie_v4_check(ip_hdr(skb), tcp_hdr(skb)); if (!mss) { __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { tsoff = secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); tcp_opt.rcv_tsecr -= tsoff; } if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; return cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb, &tcp_opt, mss, tsoff); out: return ERR_PTR(-EINVAL); } /* On input, sk is a listener. * Output is listener if incoming packet would not create a child * NULL if memory could not be allocated. */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); struct tcp_request_sock *treq; struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4; struct rtable *rt; __u8 rcv_wscale; int full_space; SKB_DR(reason); if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || !th->ack || th->rst) goto out; if (cookie_bpf_ok(skb)) { req = cookie_bpf_check(sk, skb); } else { req = cookie_tcp_check(net, sk, skb); if (IS_ERR(req)) goto out; } if (!req) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } ireq = inet_rsk(req); treq = tcp_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); if (security_inet_conn_request(sk, skb, req)) { SKB_DR_SET(reason, SECURITY_HOOK); goto out_free; } tcp_ao_syncookie(sk, skb, req, AF_INET); /* * We need to lookup the route here to get at the correct * window size. We should better make sure that the window size * hasn't changed since we received the original syn, but I see * no easy way to do this. */ flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; } /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? : dst_metric(&rt->dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); /* req->syncookie is set true only if ACK is validated * by BPF kfunc, then, rcv_wscale is already configured. */ if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ if (!ret) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } inet_sk(ret)->cork.fl.u.ip4 = fl4; out: return ret; out_free: reqsk_free(req); out_drop: sk_skb_reason_drop(sk, skb, reason); return NULL; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _INET_COMMON_H #define _INET_COMMON_H #include <linux/indirect_call_wrapper.h> #include <linux/net.h> #include <linux/netdev_features.h> #include <linux/types.h> #include <net/sock.h> extern const struct proto_ops inet_stream_ops; extern const struct proto_ops inet_dgram_ops; /* * INET4 prototypes used by INET6 */ struct msghdr; struct net; struct page; struct sock; struct sockaddr; struct socket; int inet_release(struct socket *sock); int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); int inet_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); void inet_splice_eof(struct socket *sock); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); int __inet_listen_sk(struct sock *sk, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) /* Called from BPF program. */ #define BIND_FROM_BPF (1 << 2) /* Skip CAP_NET_BIND_SERVICE check. */ #define BIND_NO_CAP_NET_BIND_SERVICE (1 << 3) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net); int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb); int inet_gro_complete(struct sk_buff *skb, int nhoff); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features); static inline void inet_ctl_sock_destroy(struct sock *sk) { if (sk) sock_release(sk->sk_socket); } #define indirect_call_gro_receive(f2, f1, cb, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_2(cb, f2, f1, head, skb); \ }) #endif
49 49 50 50 50 49 49 29 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 // SPDX-License-Identifier: LGPL-2.0+ /* * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. * This file is part of the GNU C Library. * Contributed by Paul Eggert (eggert@twinsun.com). * * The GNU C Library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * The GNU C Library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with the GNU C Library; see the file COPYING.LIB. If not, * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ /* * Converts the calendar time to broken-down time representation * * 2009-7-14: * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> * 2021-06-02: * Reimplemented by Cassio Neri <cassio.neri@gmail.com> */ #include <linux/time.h> #include <linux/module.h> #include <linux/kernel.h> #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) /** * time64_to_tm - converts the calendar time to local broken-down time * * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). * @offset: offset seconds adding to totalsecs. * @result: pointer to struct tm variable to receive broken-down time */ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { u32 u32tmp, day_of_century, year_of_century, day_of_year, month, day; u64 u64tmp, udays, century, year; bool is_Jan_or_Feb, is_leap_year; long days, rem; int remainder; days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); rem = remainder; rem += offset; while (rem < 0) { rem += SECS_PER_DAY; --days; } while (rem >= SECS_PER_DAY) { rem -= SECS_PER_DAY; ++days; } result->tm_hour = rem / SECS_PER_HOUR; rem %= SECS_PER_HOUR; result->tm_min = rem / 60; result->tm_sec = rem % 60; /* January 1, 1970 was a Thursday. */ result->tm_wday = (4 + days) % 7; if (result->tm_wday < 0) result->tm_wday += 7; /* * The following algorithm is, basically, Proposition 6.3 of Neri * and Schneider [1]. In a few words: it works on the computational * (fictitious) calendar where the year starts in March, month = 2 * (*), and finishes in February, month = 13. This calendar is * mathematically convenient because the day of the year does not * depend on whether the year is leap or not. For instance: * * March 1st 0-th day of the year; * ... * April 1st 31-st day of the year; * ... * January 1st 306-th day of the year; (Important!) * ... * February 28th 364-th day of the year; * February 29th 365-th day of the year (if it exists). * * After having worked out the date in the computational calendar * (using just arithmetics) it's easy to convert it to the * corresponding date in the Gregorian calendar. * * [1] "Euclidean Affine Functions and Applications to Calendar * Algorithms". https://arxiv.org/abs/2102.06959 * * (*) The numbering of months follows tm more closely and thus, * is slightly different from [1]. */ udays = ((u64) days) + 2305843009213814918ULL; u64tmp = 4 * udays + 3; century = div64_u64_rem(u64tmp, 146097, &u64tmp); day_of_century = (u32) (u64tmp / 4); u32tmp = 4 * day_of_century + 3; u64tmp = 2939745ULL * u32tmp; year_of_century = upper_32_bits(u64tmp); day_of_year = lower_32_bits(u64tmp) / 2939745 / 4; year = 100 * century + year_of_century; is_leap_year = year_of_century ? !(year_of_century % 4) : !(century % 4); u32tmp = 2141 * day_of_year + 132377; month = u32tmp >> 16; day = ((u16) u32tmp) / 2141; /* * Recall that January 1st is the 306-th day of the year in the * computational (not Gregorian) calendar. */ is_Jan_or_Feb = day_of_year >= 306; /* Convert to the Gregorian calendar and adjust to Unix time. */ year = year + is_Jan_or_Feb - 6313183731940000ULL; month = is_Jan_or_Feb ? month - 12 : month; day = day + 1; day_of_year += is_Jan_or_Feb ? -306 : 31 + 28 + is_leap_year; /* Convert to tm's format. */ result->tm_year = (long) (year - 1900); result->tm_mon = (int) month; result->tm_mday = (int) day; result->tm_yday = (int) day_of_year; } EXPORT_SYMBOL(time64_to_tm);
9 9 23 19 13 4 3 9 6 4 3 3 3 6 6 4 2 2 6 1 23 8 23 22 23 23 9 9 9 1 1 8 8 8 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 /* * umip.c Emulation for instruction protected by the User-Mode Instruction * Prevention feature * * Copyright (c) 2017, Intel Corporation. * Ricardo Neri <ricardo.neri-calderon@linux.intel.com> */ #include <linux/uaccess.h> #include <asm/umip.h> #include <asm/traps.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include <linux/ratelimit.h> #undef pr_fmt #define pr_fmt(fmt) "umip: " fmt /** DOC: Emulation for User-Mode Instruction Prevention (UMIP) * * User-Mode Instruction Prevention is a security feature present in recent * x86 processors that, when enabled, prevents a group of instructions (SGDT, * SIDT, SLDT, SMSW and STR) from being run in user mode by issuing a general * protection fault if the instruction is executed with CPL > 0. * * Rather than relaying to the user space the general protection fault caused by * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be * trapped and emulate the result of such instructions to provide dummy values. * This allows to both conserve the current kernel behavior and not reveal the * system resources that UMIP intends to protect (i.e., the locations of the * global descriptor and interrupt descriptor tables, the segment selectors of * the local descriptor table, the value of the task state register and the * contents of the CR0 register). * * This emulation is needed because certain applications (e.g., WineHQ and * DOSEMU2) rely on this subset of instructions to function. * * The instructions protected by UMIP can be split in two groups. Those which * return a kernel memory address (SGDT and SIDT) and those which return a * value (SLDT, STR and SMSW). * * For the instructions that return a kernel memory address, applications * such as WineHQ rely on the result being located in the kernel memory space, * not the actual location of the table. The result is emulated as a hard-coded * value that, lies close to the top of the kernel memory. The limit for the GDT * and the IDT are set to zero. * * The instruction SMSW is emulated to return the value that the register CR0 * has at boot time as set in the head_32. * SLDT and STR are emulated to return the values that the kernel programmatically * assigns: * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not. * - STR returns (GDT_ENTRY_TSS * 8). * * Emulation is provided for both 32-bit and 64-bit processes. * * Care is taken to appropriately emulate the results when segmentation is * used. That is, rather than relying on USER_DS and USER_CS, the function * insn_get_addr_ref() inspects the segment descriptor pointed by the * registers in pt_regs. This ensures that we correctly obtain the segment * base address and the address and operand sizes even if the user space * application uses a local descriptor table. */ #define UMIP_DUMMY_GDT_BASE 0xfffffffffffe0000ULL #define UMIP_DUMMY_IDT_BASE 0xffffffffffff0000ULL /* * The SGDT and SIDT instructions store the contents of the global descriptor * table and interrupt table registers, respectively. The destination is a * memory operand of X+2 bytes. X bytes are used to store the base address of * the table and 2 bytes are used to store the limit. In 32-bit processes X * has a value of 4, in 64-bit processes X has a value of 8. */ #define UMIP_GDT_IDT_BASE_SIZE_64BIT 8 #define UMIP_GDT_IDT_BASE_SIZE_32BIT 4 #define UMIP_GDT_IDT_LIMIT_SIZE 2 #define UMIP_INST_SGDT 0 /* 0F 01 /0 */ #define UMIP_INST_SIDT 1 /* 0F 01 /1 */ #define UMIP_INST_SMSW 2 /* 0F 01 /4 */ #define UMIP_INST_SLDT 3 /* 0F 00 /0 */ #define UMIP_INST_STR 4 /* 0F 00 /1 */ static const char * const umip_insns[5] = { [UMIP_INST_SGDT] = "SGDT", [UMIP_INST_SIDT] = "SIDT", [UMIP_INST_SMSW] = "SMSW", [UMIP_INST_SLDT] = "SLDT", [UMIP_INST_STR] = "STR", }; #define umip_pr_err(regs, fmt, ...) \ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) #define umip_pr_debug(regs, fmt, ...) \ umip_printk(regs, KERN_DEBUG, fmt, ##__VA_ARGS__) /** * umip_printk() - Print a rate-limited message * @regs: Register set with the context in which the warning is printed * @log_level: Kernel log level to print the message * @fmt: The text string to print * * Print the text contained in @fmt. The print rate is limited to bursts of 5 * messages every two minutes. The purpose of this customized version of * printk() is to print messages when user space processes use any of the * UMIP-protected instructions. Thus, the printed text is prepended with the * task name and process ID number of the current task as well as the * instruction and stack pointers in @regs as seen when entering kernel mode. * * Returns: * * None. */ static __printf(3, 4) void umip_printk(const struct pt_regs *regs, const char *log_level, const char *fmt, ...) { /* Bursts of 5 messages every two minutes */ static DEFINE_RATELIMIT_STATE(ratelimit, 2 * 60 * HZ, 5); struct task_struct *tsk = current; struct va_format vaf; va_list args; if (!__ratelimit(&ratelimit)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk("%s" pr_fmt("%s[%d] ip:%lx sp:%lx: %pV"), log_level, tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, &vaf); va_end(args); } /** * identify_insn() - Identify a UMIP-protected instruction * @insn: Instruction structure with opcode and ModRM byte. * * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected * instruction that can be emulated. * * Returns: * * On success, a constant identifying a specific UMIP-protected instruction that * can be emulated. * * -EINVAL on error or when not an UMIP-protected instruction that can be * emulated. */ static int identify_insn(struct insn *insn) { /* By getting modrm we also get the opcode. */ insn_get_modrm(insn); if (!insn->modrm.nbytes) return -EINVAL; /* The instructions of interest have 2-byte opcodes: 0F 00 or 0F 01. */ if (insn->opcode.nbytes < 2 || insn->opcode.bytes[0] != 0xf) return -EINVAL; if (insn->opcode.bytes[1] == 0x1) { switch (X86_MODRM_REG(insn->modrm.value)) { case 0: /* The reg form of 0F 01 /0 encodes VMX instructions. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) return -EINVAL; return UMIP_INST_SGDT; case 1: /* * The reg form of 0F 01 /1 encodes MONITOR/MWAIT, * STAC/CLAC, and ENCLS. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) return -EINVAL; return UMIP_INST_SIDT; case 4: return UMIP_INST_SMSW; default: return -EINVAL; } } else if (insn->opcode.bytes[1] == 0x0) { if (X86_MODRM_REG(insn->modrm.value) == 0) return UMIP_INST_SLDT; else if (X86_MODRM_REG(insn->modrm.value) == 1) return UMIP_INST_STR; else return -EINVAL; } else { return -EINVAL; } } /** * emulate_umip_insn() - Emulate UMIP instructions and return dummy values * @insn: Instruction structure with operands * @umip_inst: A constant indicating the instruction to emulate * @data: Buffer into which the dummy result is stored * @data_size: Size of the emulated result * @x86_64: true if process is 64-bit, false otherwise * * Emulate an instruction protected by UMIP and provide a dummy result. The * result of the emulation is saved in @data. The size of the results depends * on both the instruction and type of operand (register vs memory address). * The size of the result is updated in @data_size. Caller is responsible * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE + * UMIP_GDT_IDT_LIMIT_SIZE bytes. * * Returns: * * 0 on success, -EINVAL on error while emulating. */ static int emulate_umip_insn(struct insn *insn, int umip_inst, unsigned char *data, int *data_size, bool x86_64) { if (!data || !data_size || !insn) return -EINVAL; /* * These two instructions return the base address and limit of the * global and interrupt descriptor table, respectively. According to the * Intel Software Development manual, the base address can be 24-bit, * 32-bit or 64-bit. Limit is always 16-bit. If the operand size is * 16-bit, the returned value of the base address is supposed to be a * zero-extended 24-byte number. However, it seems that a 32-byte number * is always returned irrespective of the operand size. */ if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) { u64 dummy_base_addr; u16 dummy_limit = 0; /* SGDT and SIDT do not use registers operands. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) return -EINVAL; if (umip_inst == UMIP_INST_SGDT) dummy_base_addr = UMIP_DUMMY_GDT_BASE; else dummy_base_addr = UMIP_DUMMY_IDT_BASE; /* * 64-bit processes use the entire dummy base address. * 32-bit processes use the lower 32 bits of the base address. * dummy_base_addr is always 64 bits, but we memcpy the correct * number of bytes from it to the destination. */ if (x86_64) *data_size = UMIP_GDT_IDT_BASE_SIZE_64BIT; else *data_size = UMIP_GDT_IDT_BASE_SIZE_32BIT; memcpy(data + 2, &dummy_base_addr, *data_size); *data_size += UMIP_GDT_IDT_LIMIT_SIZE; memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT || umip_inst == UMIP_INST_STR) { unsigned long dummy_value; if (umip_inst == UMIP_INST_SMSW) { dummy_value = CR0_STATE; } else if (umip_inst == UMIP_INST_STR) { dummy_value = GDT_ENTRY_TSS * 8; } else if (umip_inst == UMIP_INST_SLDT) { #ifdef CONFIG_MODIFY_LDT_SYSCALL down_read(&current->mm->context.ldt_usr_sem); if (current->mm->context.ldt) dummy_value = GDT_ENTRY_LDT * 8; else dummy_value = 0; up_read(&current->mm->context.ldt_usr_sem); #else dummy_value = 0; #endif } /* * For these 3 instructions, the number * of bytes to be copied in the result buffer is determined * by whether the operand is a register or a memory location. * If operand is a register, return as many bytes as the operand * size. If operand is memory, return only the two least * significant bytes. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) *data_size = insn->opnd_bytes; else *data_size = 2; memcpy(data, &dummy_value, *data_size); } else { return -EINVAL; } return 0; } /** * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR * @addr: Address that caused the signal * @regs: Register set containing the instruction pointer * * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is * intended to be used to provide a segmentation fault when the result of the * UMIP emulation could not be copied to the user space memory. * * Returns: none */ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) { struct task_struct *tsk = current; tsk->thread.cr2 = (unsigned long)addr; tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; tsk->thread.trap_nr = X86_TRAP_PF; force_sig_fault(SIGSEGV, SEGV_MAPERR, addr); if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV))) return; umip_pr_err(regs, "segfault in emulation. error%x\n", X86_PF_USER | X86_PF_WRITE); } /** * fixup_umip_exception() - Fixup a general protection fault caused by UMIP * @regs: Registers as saved when entering the #GP handler * * The instructions SGDT, SIDT, STR, SMSW and SLDT cause a general protection * fault if executed with CPL > 0 (i.e., from user space). This function fixes * the exception up and provides dummy results for SGDT, SIDT and SMSW; STR * and SLDT are not fixed up. * * If operands are memory addresses, results are copied to user-space memory as * indicated by the instruction pointed by eIP using the registers indicated in * the instruction operands. If operands are registers, results are copied into * the context that was saved when entering kernel mode. * * Returns: * * True if emulation was successful; false if not. */ bool fixup_umip_exception(struct pt_regs *regs) { int nr_copied, reg_offset, dummy_data_size, umip_inst; /* 10 bytes is the maximum size of the result of UMIP instructions */ unsigned char dummy_data[10] = { 0 }; unsigned char buf[MAX_INSN_SIZE]; unsigned long *reg_addr; void __user *uaddr; struct insn insn; if (!regs) return false; /* * Give up on emulation if fetching the instruction failed. Should a * page fault or a #GP be issued? */ nr_copied = insn_fetch_from_user(regs, buf); if (nr_copied <= 0) return false; if (!insn_decode_from_regs(&insn, regs, buf, nr_copied)) return false; umip_inst = identify_insn(&insn); if (umip_inst < 0) return false; umip_pr_debug(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); umip_pr_debug(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, user_64bit_mode(regs))) return false; /* * If operand is a register, write result to the copy of the register * value that was pushed to the stack when entering into kernel mode. * Upon exit, the value we write will be restored to the actual hardware * register. */ if (X86_MODRM_MOD(insn.modrm.value) == 3) { reg_offset = insn_get_modrm_rm_off(&insn, regs); /* * Negative values are usually errors. In memory addressing, * the exception is -EDOM. Since we expect a register operand, * all negative values are errors. */ if (reg_offset < 0) return false; reg_addr = (unsigned long *)((unsigned long)regs + reg_offset); memcpy(reg_addr, dummy_data, dummy_data_size); } else { uaddr = insn_get_addr_ref(&insn, regs); if ((unsigned long)uaddr == -1L) return false; nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size); if (nr_copied > 0) { /* * If copy fails, send a signal and tell caller that * fault was fixed up. */ force_sig_info_umip_fault(uaddr, regs); return true; } } /* increase IP to let the program keep going */ regs->ip += insn.length; return true; }
172 171 276 270 270 1 271 490 487 485 216 13 50 239 480 88 35 96 2 1 173 121 482 63 8 8 2 486 3 2 1 9 9 1 2 6 8 6 2 8 7 2 337 485 486 46 9 1 25 2 9 46 1 4145 4150 1592 128 4152 2894 292 290 6 4144 1035 276 1 736 735 1 4169 2 123 5 1 172 172 614 480 620 2234 2230 1085 1077 759 2069 1985 1980 193 280 275 274 276 274 1095 13 1093 2890 2905 4157 489 46 73 74 26 4683 4678 4615 4715 4234 4193 154 6 4192 4184 4160 4645 759 760 754 4682 146 16 16 15 16 16 16 16 4627 90 76 89 1242 1242 1248 1237 9 1226 1242 1242 197 197 196 195 198 567 9 564 11 11 11 779 786 779 780 776 2001 1227 167 884 883 784 788 789 11 11 2003 1208 33 2023 1999 2009 1213 1221 1211 1209 34 33 33 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem. */ #define MAX_POLICY_RECURSION_DEPTH 10 static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth); static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ if (bf->value & ~bf->selector) return -EINVAL; return 0; } static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; nla_for_each_attr(entry, head, len, rem) { int ret; if (nla_len(entry) == 0) continue; if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, "Array element too short"); return -ERANGE; } ret = __nla_validate_parse(nla_data(entry), nla_len(entry), maxtype, policy, validate, extack, NULL, depth + 1); if (ret < 0) return ret; } return 0; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range) { WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); range->min = 0; switch (pt->type) { case NLA_U8: range->max = U8_MAX; break; case NLA_U16: case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { struct netlink_range_validation range; u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_MSECS: value = nla_get_u64(nla); break; case NLA_BINARY: value = nla_len(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } nla_get_range_unsigned(pt, &range); if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG && pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } /* this assumes min <= max (don't validate against min) */ return 0; } if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY; if (binary) NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "binary attribute size out of range"); else NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range) { switch (pt->type) { case NLA_S8: range->min = S8_MIN; range->max = S8_MAX; break; case NLA_S16: range->min = S16_MIN; range->max = S16_MAX; break; case NLA_S32: range->min = S32_MIN; range->max = S32_MAX; break; case NLA_S64: case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_int_range_signed(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct netlink_range_validation_signed range; s64 value; switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; case NLA_S16: value = nla_get_s16(nla); break; case NLA_S32: value = nla_get_s32(nla); break; case NLA_S64: value = nla_get_s64(nla); break; case NLA_SINT: value = nla_get_sint(nla); break; default: return -EINVAL; } nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } static int nla_validate_int_range(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { switch (pt->type) { case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); return -EINVAL; } } static int nla_validate_mask(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); return -EINVAL; } return 0; } static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); int err = -ERANGE; if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT; if (type <= 0 || type > maxtype) return 0; type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } } if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED not expected"); return -EINVAL; } } switch (pt->type) { case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; goto out_err; case NLA_FLAG: if (attrlen > 0) goto out_err; break; case NLA_SINT: case NLA_UINT: if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); else minlen = attrlen; if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { err = -EINVAL; goto out_err; } fallthrough; case NLA_STRING: if (attrlen < 1) goto out_err; if (pt->len) { char *buf = nla_data(nla); if (buf[attrlen - 1] == '\0') attrlen--; if (attrlen > pt->len) goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) goto out_err; break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, validate, extack, NULL, depth + 1); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_NESTED_ARRAY: /* a nested array attribute is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_UNSPEC: if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unsupported attribute"); return -EINVAL; } if (attrlen < pt->len) goto out_err; break; default: if (pt->len) minlen = pt->len; else minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) goto out_err; } /* further validation */ switch (pt->validation_type) { case NLA_VALIDATE_NONE: /* nothing to do */ break; case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: err = nla_validate_int_range(pt, nla, extack, validate); if (err) return err; break; case NLA_VALIDATE_MASK: err = nla_validate_mask(pt, nla, extack); if (err) return err; break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); if (err) return err; } break; } return 0; out_err: NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "Attribute failed policy validation"); return err; } static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack, "allowed policy recursion depth exceeded"); return -EINVAL; } if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute type"); return -EINVAL; } continue; } type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); if (err < 0) return err; } if (tb) tb[type] = (struct nlattr *)nla; } if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING) return -EINVAL; } return 0; } /** * __nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation depends on the validate flags passed, see * &enum netlink_validation for more details on that. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); /** * nla_policy_len - Determine the max. length of a policy * @p: policy to use * @n: number of policies * * Determines the max. length of the policy. It is currently used * to allocated Netlink buffers roughly the size of the actual * message. * * Returns 0 on success or a negative error code. */ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. * Unlike strscpy() the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. */ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. */ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. */ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif
4 128 333 3 18 342 4 4 11 10 7 2 5 1 3 7 1 1 2 218 2 1 2 217 8 1 26 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _ASM_X86_INSN_H #define _ASM_X86_INSN_H /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2009 */ #include <asm/byteorder.h> /* insn_attr_t is defined in inat.h */ #include <asm/inat.h> /* __ignore_sync_check__ */ #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) struct insn_field { union { insn_value_t value; insn_byte_t bytes[4]; }; /* !0 if we've run insn_get_xxx() for this field */ unsigned char got; unsigned char nbytes; }; static inline void insn_field_set(struct insn_field *p, insn_value_t v, unsigned char n) { p->value = v; p->nbytes = n; } static inline void insn_set_byte(struct insn_field *p, unsigned char n, insn_byte_t v) { p->bytes[n] = v; } #else struct insn_field { insn_value_t value; union { insn_value_t little; insn_byte_t bytes[4]; }; /* !0 if we've run insn_get_xxx() for this field */ unsigned char got; unsigned char nbytes; }; static inline void insn_field_set(struct insn_field *p, insn_value_t v, unsigned char n) { p->value = v; p->little = __cpu_to_le32(v); p->nbytes = n; } static inline void insn_set_byte(struct insn_field *p, unsigned char n, insn_byte_t v) { p->bytes[n] = v; p->value = __le32_to_cpu(p->little); } #endif struct insn { struct insn_field prefixes; /* * Prefixes * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ union { struct insn_field vex_prefix; /* VEX prefix */ struct insn_field xop_prefix; /* XOP prefix */ }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 * opcode.bytes[2]: opcode3 */ struct insn_field modrm; struct insn_field sib; struct insn_field displacement; union { struct insn_field immediate; struct insn_field moffset1; /* for 64bit MOV */ struct insn_field immediate1; /* for 64bit imm or off16/32 */ }; union { struct insn_field moffset2; /* for 64bit MOV */ struct insn_field immediate2; /* for 64bit imm or seg16 */ }; int emulate_prefix_size; insn_attr_t attr; unsigned char opnd_bytes; unsigned char addr_bytes; unsigned char length; unsigned char x86_64; const insn_byte_t *kaddr; /* kernel address of insn to analyze */ const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ const insn_byte_t *next_byte; }; #define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) #define X86_MODRM_RM(modrm) ((modrm) & 0x07) #define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6) #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) #define X86_REX2_M(rex) ((rex) & 0x80) /* REX2 M0 */ #define X86_REX2_R(rex) ((rex) & 0x40) /* REX2 R4 */ #define X86_REX2_X(rex) ((rex) & 0x20) /* REX2 X4 */ #define X86_REX2_B(rex) ((rex) & 0x10) /* REX2 B4 */ #define X86_REX_W(rex) ((rex) & 8) /* REX or REX2 W */ #define X86_REX_R(rex) ((rex) & 4) /* REX or REX2 R3 */ #define X86_REX_X(rex) ((rex) & 2) /* REX or REX2 X3 */ #define X86_REX_B(rex) ((rex) & 1) /* REX or REX2 B3 */ /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ #define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */ #define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */ #define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ #define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ /* VEX bit fields */ #define X86_EVEX_M(vex) ((vex) & 0x07) /* EVEX Byte1 */ #define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ #define X86_VEX2_M 1 /* VEX2.M always 1 */ #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ /* XOP bit fields */ #define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ #define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ #define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ #define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ #define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ #define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ #define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */ #define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ #define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ #define X86_XOP_M_MAX 0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); extern int insn_get_opcode(struct insn *insn); extern int insn_get_modrm(struct insn *insn); extern int insn_get_sib(struct insn *insn); extern int insn_get_displacement(struct insn *insn); extern int insn_get_immediate(struct insn *insn); extern int insn_get_length(struct insn *insn); enum insn_mode { INSN_MODE_32, INSN_MODE_64, /* Mode is determined by the current kernel build. */ INSN_MODE_KERN, INSN_NUM_MODES, }; extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m); #define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN) /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) { insn_get_modrm(insn); } /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); static inline int insn_is_rex2(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return insn->rex_prefix.nbytes == 2; } static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) { return X86_REX2_M(insn->rex_prefix.bytes[1]); } static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.value != 0); } static inline int insn_is_evex(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.nbytes == 4); } /* If we already know this is AVX/XOP encoded */ static inline int avx_insn_is_xop(struct insn *insn) { insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); return inat_is_xop_prefix(attr); } static inline int insn_is_xop(struct insn *insn) { if (!insn_is_avx_or_xop(insn)) return 0; return avx_insn_is_xop(insn); } static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; } static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX2_M; else if (insn->vex_prefix.nbytes == 3) /* 3 bytes VEX */ return X86_VEX3_M(insn->vex_prefix.bytes[1]); else /* EVEX */ return X86_EVEX_M(insn->vex_prefix.bytes[1]); } static inline insn_byte_t insn_vex_p_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX_P(insn->vex_prefix.bytes[1]); else return X86_VEX_P(insn->vex_prefix.bytes[2]); } static inline insn_byte_t insn_vex_w_bit(struct insn *insn) { if (insn->vex_prefix.nbytes < 3) return 0; return X86_VEX_W(insn->vex_prefix.bytes[2]); } static inline insn_byte_t insn_xop_map_bits(struct insn *insn) { if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ return 0; return X86_XOP_M(insn->xop_prefix.bytes[1]); } static inline insn_byte_t insn_xop_p_bits(struct insn *insn) { return X86_XOP_P(insn->vex_prefix.bytes[2]); } /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { if (insn_is_avx_or_xop(insn)) { if (avx_insn_is_xop(insn)) return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); return 0; } /* Offset of each field from kaddr */ static inline int insn_offset_rex_prefix(struct insn *insn) { return insn->prefixes.nbytes; } static inline int insn_offset_vex_prefix(struct insn *insn) { return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; } static inline int insn_offset_opcode(struct insn *insn) { return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes; } static inline int insn_offset_modrm(struct insn *insn) { return insn_offset_opcode(insn) + insn->opcode.nbytes; } static inline int insn_offset_sib(struct insn *insn) { return insn_offset_modrm(insn) + insn->modrm.nbytes; } static inline int insn_offset_displacement(struct insn *insn) { return insn_offset_sib(insn) + insn->sib.nbytes; } static inline int insn_offset_immediate(struct insn *insn) { return insn_offset_displacement(insn) + insn->displacement.nbytes; } /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix * and the index is stored in @idx (note that this @idx is just for a cursor, * do not change it.) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. */ #define for_each_insn_prefix(insn, idx, prefix) \ for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e /* * Intel SDM Vol.3A 6.8.3 states; * "Any single-step trap that would be delivered following the MOV to SS * instruction or POP to SS instruction (because EFLAGS.TF is 1) is * suppressed." * This function returns true if @insn is MOV SS or POP SS. On these * instructions, single stepping is suppressed. */ static inline int insn_masking_exception(struct insn *insn) { return insn->opcode.bytes[0] == POP_SS_OPCODE || (insn->opcode.bytes[0] == MOV_SREG_OPCODE && X86_MODRM_REG(insn->modrm.bytes[0]) == 2); } #endif /* _ASM_X86_INSN_H */
8 8 9 9 8 9 9 9 9 8 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019 ARM Ltd. * * Generic implementation of update_vsyscall and update_vsyscall_tz. * * Based on the x86 specific implementation. */ #include <linux/hrtimer.h> #include <linux/timekeeper_internal.h> #include <vdso/datapage.h> #include <vdso/helpers.h> #include <vdso/vsyscall.h> #include "timekeeping_internal.h" static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base) { vc->cycle_last = base->cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT vc->max_cycles = base->clock->max_cycles; #endif vc->mask = base->mask; vc->mult = base->mult; vc->shift = base->shift; } static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono); fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw); /* CLOCK_MONOTONIC */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* Copy MONOTONIC time for BOOTTIME */ sec = vdso_ts->sec; /* Add the boot offset */ sec += tk->monotonic_to_boot.tv_sec; nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { struct vdso_time_data *vdata = vdso_k_time_data; struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; /* copy vsyscall data */ vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; vc[CS_HRES_COARSE].clock_mode = clock_mode; vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->coarse_nsec; /* CLOCK_MONOTONIC_COARSE */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->coarse_nsec; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). */ WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_time_data(vdata, tk); __arch_update_vdso_clock(&vc[CS_HRES_COARSE]); __arch_update_vdso_clock(&vc[CS_RAW]); vdso_write_end(vdata); __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { struct vdso_time_data *vdata = vdso_k_time_data; vdata->tz_minuteswest = sys_tz.tz_minuteswest; vdata->tz_dsttime = sys_tz.tz_dsttime; __arch_sync_vdso_time_data(vdata); } #ifdef CONFIG_POSIX_AUX_CLOCKS void vdso_time_update_aux(struct timekeeper *tk) { struct vdso_time_data *vdata = vdso_k_time_data; struct vdso_timestamp *vdso_ts; struct vdso_clock *vc; s32 clock_mode; u64 nsec; vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST]; vdso_ts = &vc->basetime[VDSO_BASE_AUX]; clock_mode = tk->tkr_mono.clock->vdso_clock_mode; if (!tk->clock_valid) clock_mode = VDSO_CLOCKMODE_NONE; /* copy vsyscall data */ vdso_write_begin_clock(vc); vc->clock_mode = clock_mode; if (clock_mode != VDSO_CLOCKMODE_NONE) { fill_clock_configuration(vc, &tk->tkr_mono); vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec += tk->monotonic_to_aux.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); nsec = nsec << tk->tkr_mono.shift; vdso_ts->nsec = nsec; } __arch_update_vdso_clock(vc); vdso_write_end_clock(vc); __arch_sync_vdso_time_data(vdata); } #endif /** * vdso_update_begin - Start of a VDSO update section * * Allows architecture code to safely update the architecture specific VDSO * data. Disables interrupts, acquires timekeeper lock to serialize against * concurrent updates from timekeeping and invalidates the VDSO data * sequence counter to prevent concurrent readers from accessing * inconsistent data. * * Returns: Saved interrupt flags which need to be handed in to * vdso_update_end(). */ unsigned long vdso_update_begin(void) { struct vdso_time_data *vdata = vdso_k_time_data; unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); return flags; } /** * vdso_update_end - End of a VDSO update section * @flags: Interrupt flags as returned from vdso_update_begin() * * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data * synchronization if the architecture requires it, drops timekeeper lock * and restores interrupt flags. */ void vdso_update_end(unsigned long flags) { struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); __arch_sync_vdso_time_data(vdata); timekeeper_unlock_irqrestore(flags); }
1 2 2 1 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ /* 6 bucketsize, initval support added */ #define IPSET_TYPE_REV_MAX 7 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ #define HTYPE hash_ipport #define IP_SET_HASH_WITH_NETMASK #define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ /* Member elements */ struct hash_ipport4_elem { __be32 ip; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, const struct hash_ipport4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipport4_data_list(struct sk_buff *skb, const struct hash_ipport4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct hash_ipport4_elem *d) { next->ip = d->ip; next->port = d->port; } #define MTYPE hash_ipport4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); const struct MTYPE *h = set->data; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= h->bitmask.ip; if (e.ip == 0) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.ip &= h->bitmask.ip; if (e.ip == 0) return -EINVAL; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); if (i > IPSET_MAX_RANGE) { hash_ipport4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } } return ret; } /* IPv6 variant */ struct hash_ipport6_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipport6_data_list(struct sk_buff *skb, const struct hash_ipport6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipport6_data_next(struct hash_ipport6_elem *next, const struct hash_ipport6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipport6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); const struct MTYPE *h = set->data; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipport6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipport_type __read_mostly = { .name = "hash:ip,port", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipport_init(void) { return ip_set_type_register(&hash_ipport_type); } static void __exit hash_ipport_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipport_type); } module_init(hash_ipport_init); module_exit(hash_ipport_fini);
4 3 3 3 3 3 3 2 2 2 2 8 8 8 1 1 1 1 1 1 10 10 10 13 13 13 13 13 13 11 11 11 11 1 1 1 1 1 1 1 1 8 8 1 8 1 8 8 8 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/proc/net.c * * Copyright (C) 2007 * * Author: Eric Biederman <ebiederm@xmission.com> * * proc net directory handling functions */ #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/mount.h> #include <linux/nsproxy.h> #include <linux/uidgid.h> #include <net/net_namespace.h> #include <linux/seq_file.h> #include "internal.h" static inline struct net *PDE_NET(struct proc_dir_entry *pde) { return pde->parent->data; } static struct net *get_proc_net(const struct inode *inode) { return maybe_get_net(PDE_NET(PDE(inode))); } static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; struct seq_net_private *p; struct net *net; WARN_ON_ONCE(state_size < sizeof(*p)); if (file->f_mode & FMODE_WRITE && !PDE(inode)->write) return -EACCES; net = get_proc_net(inode); if (!net) return -ENXIO; p = __seq_open_private(file, PDE(inode)->seq_ops, state_size); if (!p) { put_net(net); return -ENOMEM; } #ifdef CONFIG_NET_NS p->net = net; netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL); #endif return 0; } static void seq_file_net_put_net(struct seq_file *seq) { #ifdef CONFIG_NET_NS struct seq_net_private *priv = seq->private; put_net_track(priv->net, &priv->ns_tracker); #else put_net(&init_net); #endif } static int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; seq_file_net_put_net(seq); seq_release_private(ino, f); return 0; } static const struct proc_ops proc_net_seq_ops = { .proc_open = seq_open_net, .proc_read = seq_read, .proc_write = proc_simple_write, .proc_lseek = seq_lseek, .proc_release = seq_release_net, }; int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker, GFP_KERNEL); #endif return 0; } void bpf_iter_fini_seq_net(void *priv_data) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; put_net_track(p->net, &p->ns_tracker); #endif } struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_data); /** * proc_create_net_data_write - Create a writable net_ns-specific proc file * @name: The name of the file. * @mode: The file's access mode. * @parent: The parent directory in which to create. * @ops: The seq_file ops with which to read the file. * @write: The write method with which to 'modify' the file. * @state_size: The size of the per-file private state to allocate. * @data: Data for retrieval by pde_data(). * * Create a network namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a * series of elements and also provides for the file accepting writes that have * some arbitrary effect. * * The functions in the @ops table are used to iterate over items to be * presented and extract the readable content using the seq_file interface. * * The @write function is called with the data copied into a kernel space * scratch buffer and has a NUL appended for convenience. The buffer may be * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, proc_write_t write, unsigned int state_size, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; p->write = write; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_data_write); static int single_open_net(struct inode *inode, struct file *file) { struct proc_dir_entry *de = PDE(inode); struct net *net; int err; net = get_proc_net(inode); if (!net) return -ENXIO; err = single_open(file, de->single_show, net); if (err) put_net(net); return err; } static int single_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; put_net(seq->private); return single_release(ino, f); } static const struct proc_ops proc_net_single_ops = { .proc_open = single_open_net, .proc_read = seq_read, .proc_write = proc_simple_write, .proc_lseek = seq_lseek, .proc_release = single_release_net, }; struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_single_ops; p->single_show = show; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_single); /** * proc_create_net_single_write - Create a writable net_ns-specific proc file * @name: The name of the file. * @mode: The file's access mode. * @parent: The parent directory in which to create. * @show: The seqfile show method with which to read the file. * @write: The write method with which to 'modify' the file. * @data: Data for retrieval by pde_data(). * * Create a network-namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a * single element rather than a series and also provides for the file accepting * writes that have some arbitrary effect. * * The @show function is called to extract the readable content via the * seq_file interface. * * The @write function is called with the data copied into a kernel space * scratch buffer and has a NUL appended for convenience. The buffer may be * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_single_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), proc_write_t write, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_single_ops; p->single_show = show; p->write = write; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_single_write); static struct net *get_proc_task_net(struct inode *dir) { struct task_struct *task; struct nsproxy *ns; struct net *net = NULL; rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); if (task != NULL) { task_lock(task); ns = task->nsproxy; if (ns != NULL) net = get_net(ns->net_ns); task_unlock(task); } rcu_read_unlock(); return net; } static struct dentry *proc_tgid_net_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct dentry *de; struct net *net; de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { de = proc_lookup_de(dir, dentry, net->proc_net); put_net(net); } return de; } static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct net *net; net = get_proc_task_net(inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; put_net(net); } return 0; } const struct inode_operations proc_net_inode_operations = { .lookup = proc_tgid_net_lookup, .getattr = proc_tgid_net_getattr, .setattr = proc_setattr, }; static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) { int ret; struct net *net; ret = -EINVAL; net = get_proc_task_net(file_inode(file)); if (net != NULL) { ret = proc_readdir_de(file, ctx, net->proc_net); put_net(net); } return ret; } const struct file_operations proc_net_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = proc_tgid_net_readdir, }; static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; kuid_t uid; kgid_t gid; int err; /* * This PDE acts only as an anchor for /proc/${pid}/net hierarchy. * Corresponding inode (PDE(inode) == net->proc_net) is never * instantiated therefore blanket zeroing is fine. * net->proc_net_stat inode is instantiated normally. */ err = -ENOMEM; netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) goto out; netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; netd->parent = &proc_root; netd->name = netd->inline_name; memcpy(netd->name, "net", 4); uid = make_kuid(net->user_ns, 0); if (!uid_valid(uid)) uid = netd->uid; gid = make_kgid(net->user_ns, 0); if (!gid_valid(gid)) gid = netd->gid; proc_set_user(netd, uid, gid); /* Seed dentry revalidation for /proc/${pid}/net */ pde_force_lookup(netd); err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) goto free_net; net->proc_net = netd; net->proc_net_stat = net_statd; return 0; free_net: pde_free(netd); out: return err; } static __net_exit void proc_net_ns_exit(struct net *net) { remove_proc_entry("stat", net->proc_net); pde_free(net->proc_net); } static struct pernet_operations __net_initdata proc_net_ns_ops = { .init = proc_net_ns_init, .exit = proc_net_ns_exit, }; int __init proc_net_init(void) { proc_symlink("net", NULL, "self/net"); return register_pernet_subsys(&proc_net_ns_ops); }
27 2 2 2 45 2 50 5 2 1 1 38 14 23 45 46 46 46 38 37 37 46 38 37 38 38 2 38 6 6 5 5 38 38 19 19 15 15 4 28 22 22 14 10 1 9 26 10 2 1 8 6 5 3 4 4 38 25 1 33 33 33 10 1 9 22 4 3 3 5 25 23 22 2 21 38 4 4 1 1 1 27 25 26 26 26 13 26 10 1 9 26 3 3 3 3 1 3 3 54 3 3 1 1 3 3 3 3 55 54 2 53 9 22 46 46 46 46 57 2 57 57 4 4 13 4 1 3 54 19 13 1 13 55 46 46 6 6 43 42 6 46 47 11 41 41 41 41 47 47 7 1 47 47 40 47 47 46 13 38 61 15 56 45 31 46 2 46 15 15 4 4 4 13 10 9 10 3 2 3 3 3 19 10 9 19 4 19 19 19 19 11 2 11 11 5 6 6 5 5 2 4 11 12 10 9 10 12 10 3 10 6 6 6 3 3 3 1 1 1 1 1 2 2 6 6 3 3 6 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/hashtable.h> #include <linux/io_uring.h> #include <trace/events/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "alloc_cache.h" #include "refs.h" #include "napi.h" #include "opdef.h" #include "kbuf.h" #include "poll.h" #include "cancel.h" struct io_poll_update { struct file *file; u64 old_user_data; u64 new_user_data; __poll_t events; bool update_events; bool update_user_data; }; struct io_poll_table { struct poll_table_struct pt; struct io_kiocb *req; int nr_entries; int error; bool owning; /* output value, set only if arm poll returns >0 */ __poll_t result_mask; }; #define IO_POLL_CANCEL_FLAG BIT(31) #define IO_POLL_RETRY_FLAG BIT(30) #define IO_POLL_REF_MASK GENMASK(29, 0) /* * We usually have 1-2 refs taken, 128 is more than enough and we want to * maximise the margin between this amount and the moment when it overflows. */ #define IO_POLL_REF_BIAS 128 #define IO_WQE_F_DOUBLE 1 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key); static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe) { unsigned long priv = (unsigned long)wqe->private; return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE); } static inline bool wqe_is_double(struct wait_queue_entry *wqe) { unsigned long priv = (unsigned long)wqe->private; return priv & IO_WQE_F_DOUBLE; } static bool io_poll_get_ownership_slowpath(struct io_kiocb *req) { int v; /* * poll_refs are already elevated and we don't have much hope for * grabbing the ownership. Instead of incrementing set a retry flag * to notify the loop that there might have been some change. */ v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs); if (v & IO_POLL_REF_MASK) return false; return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } /* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can * bump it and acquire ownership. It's disallowed to modify requests while not * owning it, that prevents from races for enqueueing task_work's and b/w * arming poll and wakeups. */ static inline bool io_poll_get_ownership(struct io_kiocb *req) { if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) return io_poll_get_ownership_slowpath(req); return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } static void io_poll_mark_cancelled(struct io_kiocb *req) { atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); } static struct io_poll *io_poll_get_double(struct io_kiocb *req) { /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ if (req->opcode == IORING_OP_POLL_ADD) return req->async_data; return req->apoll->double_poll; } static struct io_poll *io_poll_get_single(struct io_kiocb *req) { if (req->opcode == IORING_OP_POLL_ADD) return io_kiocb_to_cmd(req, struct io_poll); return &req->apoll->poll; } static void io_poll_req_insert(struct io_kiocb *req) { struct io_hash_table *table = &req->ctx->cancel_table; u32 index = hash_long(req->cqe.user_data, table->hash_bits); lockdep_assert_held(&req->ctx->uring_lock); hlist_add_head(&req->hash_node, &table->hbs[index].list); } static void io_init_poll_iocb(struct io_poll *poll, __poll_t events) { poll->head = NULL; #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) /* mask in events that we always want/need */ poll->events = events | IO_POLL_UNMASK; INIT_LIST_HEAD(&poll->wait.entry); init_waitqueue_func_entry(&poll->wait, io_poll_wake); } static inline void io_poll_remove_entry(struct io_poll *poll) { struct wait_queue_head *head = smp_load_acquire(&poll->head); if (head) { spin_lock_irq(&head->lock); list_del_init(&poll->wait.entry); poll->head = NULL; spin_unlock_irq(&head->lock); } } static void io_poll_remove_entries(struct io_kiocb *req) { /* * Nothing to do if neither of those flags are set. Avoid dipping * into the poll/apoll/double cachelines if we can. */ if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) return; /* * While we hold the waitqueue lock and the waitqueue is nonempty, * wake_up_pollfree() will wait for us. However, taking the waitqueue * lock in the first place can race with the waitqueue being freed. * * We solve this as eventpoll does: by taking advantage of the fact that * all users of wake_up_pollfree() will RCU-delay the actual free. If * we enter rcu_read_lock() and see that the pointer to the queue is * non-NULL, we can then lock it without the memory being freed out from * under us. * * Keep holding rcu_read_lock() as long as we hold the queue lock, in * case the caller deletes the entry from the queue, leaving it empty. * In that case, only RCU prevents the queue memory from being freed. */ rcu_read_lock(); if (req->flags & REQ_F_SINGLE_POLL) io_poll_remove_entry(io_poll_get_single(req)); if (req->flags & REQ_F_DOUBLE_POLL) io_poll_remove_entry(io_poll_get_double(req)); rcu_read_unlock(); } enum { IOU_POLL_DONE = 0, IOU_POLL_NO_ACTION = 1, IOU_POLL_REMOVE_POLL_USE_RES = 2, IOU_POLL_REISSUE = 3, IOU_POLL_REQUEUE = 4, }; static void __io_poll_execute(struct io_kiocb *req, int mask) { unsigned flags = 0; io_req_set_res(req, mask, 0); req->io_task_work.func = io_poll_task_func; trace_io_uring_task_add(req, mask); if (!(req->flags & REQ_F_POLL_NO_LAZY)) flags = IOU_F_TWQ_LAZY_WAKE; __io_req_task_work_add(req, flags); } static inline void io_poll_execute(struct io_kiocb *req, int res) { if (io_poll_get_ownership(req)) __io_poll_execute(req, res); } /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. * * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action * require, which is either spurious wakeup or multishot CQE is served. * IOU_POLL_DONE when it's done with the request, then the mask is stored in * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. */ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; if (unlikely(io_should_terminate_tw(req->ctx))) return -ECANCELED; do { v = atomic_read(&req->poll_refs); if (unlikely(v != 1)) { /* tw should be the owner and so have some refs */ if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) return IOU_POLL_NO_ACTION; if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; /* * cqe.res contains only events of the first wake up * and all others are to be lost. Redo vfs_poll() to get * up to date state. */ if ((v & IO_POLL_REF_MASK) != 1) req->cqe.res = 0; if (v & IO_POLL_RETRY_FLAG) { req->cqe.res = 0; /* * We won't find new events that came in between * vfs_poll and the ref put unless we clear the * flag in advance. */ atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs); v &= ~IO_POLL_RETRY_FLAG; } } /* the mask was stashed in __io_poll_execute */ if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; /* * We got woken with a mask, but someone else got to * it first. The above vfs_poll() doesn't add us back * to the waitqueue, so if we get nothing back, we * should be safe and attempt a reissue. */ if (unlikely(!req->cqe.res)) { /* Multishot armed need not reissue */ if (!(req->apoll_events & EPOLLONESHOT)) continue; return IOU_POLL_REISSUE; } } if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE; /* multishot, just fill a CQE and proceed */ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } } else { int ret = io_poll_issue(req, tw); if (ret == IOU_COMPLETE) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) return IOU_POLL_REQUEUE; if (ret != IOU_RETRY && ret < 0) return ret; } /* force the next iteration to vfs_poll() */ req->cqe.res = 0; /* * Release all references, retry if someone tried to restart * task_work while we were executing it. */ v &= IO_POLL_REF_MASK; } while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK); io_napi_add(req); return IOU_POLL_NO_ACTION; } void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) { int ret; ret = io_poll_check_events(req, tw); if (ret == IOU_POLL_NO_ACTION) { return; } else if (ret == IOU_POLL_REQUEUE) { __io_poll_execute(req, 0); return; } io_poll_remove_entries(req); /* task_work always has ->uring_lock held */ hash_del(&req->hash_node); if (req->opcode == IORING_OP_POLL_ADD) { if (ret == IOU_POLL_DONE) { struct io_poll *poll; poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { io_req_task_submit(req, tw); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; req_set_fail(req); } io_req_set_res(req, req->cqe.res, 0); io_req_task_complete(req, tw); } else { io_tw_lock(req->ctx, tw); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) io_req_task_complete(req, tw); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) io_req_task_submit(req, tw); else io_req_defer_failed(req, ret); } } static void io_poll_cancel_req(struct io_kiocb *req) { io_poll_mark_cancelled(req); /* kick tw, which should complete the request */ io_poll_execute(req, 0); } #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll) { io_poll_mark_cancelled(req); /* we have to kick tw in case it's not already */ io_poll_execute(req, 0); /* * If the waitqueue is being freed early but someone is already * holds ownership over it, we have to tear down the request as * best we can. That means immediately removing the request from * its waitqueue and preventing all further accesses to the * waitqueue via the request. */ list_del_init(&poll->wait.entry); /* * Careful: this *must* be the last step, since as soon * as req->head is NULL'ed out, the request can be * completed and freed, since aio_poll_complete_work() * will no longer need to take the waitqueue lock. */ smp_store_release(&poll->head, NULL); return 1; } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wqe_to_req(wait); struct io_poll *poll = container_of(wait, struct io_poll, wait); __poll_t mask = key_to_poll(key); if (unlikely(mask & POLLFREE)) return io_pollfree_wake(req, poll); /* for instances that support it check for an event match first */ if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) return 0; if (io_poll_get_ownership(req)) { /* * If we trigger a multishot poll off our own wakeup path, * disable multishot as there is a circular dependency between * CQ posting and triggering the event. */ if (mask & EPOLL_URING_WAKE) poll->events |= EPOLLONESHOT; /* optional, saves extra locking for removal in tw handler */ if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); poll->head = NULL; if (wqe_is_double(wait)) req->flags &= ~REQ_F_DOUBLE_POLL; else req->flags &= ~REQ_F_SINGLE_POLL; } __io_poll_execute(req, mask); } return 1; } /* fails only when polling is already completing by the first entry */ static bool io_poll_double_prepare(struct io_kiocb *req) { struct wait_queue_head *head; struct io_poll *poll = io_poll_get_single(req); /* head is RCU protected, see io_poll_remove_entries() comments */ rcu_read_lock(); head = smp_load_acquire(&poll->head); /* * poll arm might not hold ownership and so race for req->flags with * io_poll_wake(). There is only one poll entry queued, serialise with * it by taking its head lock. As we're still arming the tw hanlder * is not going to be run, so there are no races with it. */ if (head) { spin_lock_irq(&head->lock); req->flags |= REQ_F_DOUBLE_POLL; if (req->opcode == IORING_OP_POLL_ADD) req->flags |= REQ_F_ASYNC_DATA; spin_unlock_irq(&head->lock); } rcu_read_unlock(); return !!head; } static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, struct wait_queue_head *head, struct io_poll **poll_ptr) { struct io_kiocb *req = pt->req; unsigned long wqe_private = (unsigned long) req; /* * The file being polled uses multiple waitqueues for poll handling * (e.g. one for read, one for write). Setup a separate io_poll * if this happens. */ if (unlikely(pt->nr_entries)) { struct io_poll *first = poll; /* double add on the same waitqueue head, ignore */ if (first->head == head) return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { if ((*poll_ptr)->head == head) return; pt->error = -EINVAL; return; } poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; return; } /* mark as double wq entry */ wqe_private |= IO_WQE_F_DOUBLE; io_init_poll_iocb(poll, first->events); if (!io_poll_double_prepare(req)) { /* the request is completing, just back off */ kfree(poll); return; } *poll_ptr = poll; } else { /* fine to modify, there is no poll queued to race with us */ req->flags |= REQ_F_SINGLE_POLL; } pt->nr_entries++; poll->head = head; poll->wait.private = (void *) wqe_private; if (poll->events & EPOLLEXCLUSIVE) { add_wait_queue_exclusive(head, &poll->wait); } else { add_wait_queue(head, &poll->wait); } } static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); struct io_poll *poll = io_kiocb_to_cmd(pt->req, struct io_poll); __io_queue_proc(poll, pt, head, (struct io_poll **) &pt->req->async_data); } static bool io_poll_can_finish_inline(struct io_kiocb *req, struct io_poll_table *pt) { return pt->owning || io_poll_get_ownership(req); } static void io_poll_add_hash(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; io_ring_submit_lock(ctx, issue_flags); io_poll_req_insert(req); io_ring_submit_unlock(ctx, issue_flags); } /* * Returns 0 when it's handed over for polling. The caller owns the requests if * it returns non-zero, but otherwise should not touch it. Negative values * contain an error code. When the result is >0, the polling has completed * inline and ipt.result_mask is set to the mask. */ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll *poll, struct io_poll_table *ipt, __poll_t mask, unsigned issue_flags) { INIT_HLIST_NODE(&req->hash_node); io_init_poll_iocb(poll, mask); poll->file = req->file; req->apoll_events = poll->events; ipt->pt._key = mask; ipt->req = req; ipt->error = 0; ipt->nr_entries = 0; /* * Polling is either completed here or via task_work, so if we're in the * task context we're naturally serialised with tw by merit of running * the same task. When it's io-wq, take the ownership to prevent tw * from running. However, when we're in the task context, skip taking * it as an optimisation. * * Note: even though the request won't be completed/freed, without * ownership we still can race with io_poll_wake(). * io_poll_can_finish_inline() tries to deal with that. */ ipt->owning = issue_flags & IO_URING_F_UNLOCKED; atomic_set(&req->poll_refs, (int)ipt->owning); /* * Exclusive waits may only wake a limited amount of entries * rather than all of them, this may interfere with lazy * wake if someone does wait(events > 1). Ensure we don't do * lazy wake for those, as we need to process each one as they * come in. */ if (poll->events & EPOLLEXCLUSIVE) req->flags |= REQ_F_POLL_NO_LAZY; mask = vfs_poll(req->file, &ipt->pt) & poll->events; if (unlikely(ipt->error || !ipt->nr_entries)) { io_poll_remove_entries(req); if (!io_poll_can_finish_inline(req, ipt)) { io_poll_mark_cancelled(req); return 0; } else if (mask && (poll->events & EPOLLET)) { ipt->result_mask = mask; return 1; } return ipt->error ?: -EINVAL; } if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { if (!io_poll_can_finish_inline(req, ipt)) { io_poll_add_hash(req, issue_flags); return 0; } io_poll_remove_entries(req); ipt->result_mask = mask; /* no one else has access to the req, forget about the ref */ return 1; } io_poll_add_hash(req, issue_flags); if (mask && (poll->events & EPOLLET) && io_poll_can_finish_inline(req, ipt)) { __io_poll_execute(req, mask); return 0; } io_napi_add(req); if (ipt->owning) { /* * Try to release ownership. If we see a change of state, e.g. * poll was waken up, queue up a tw, it'll deal with it. */ if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1) __io_poll_execute(req, 0); } return 0; } static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); struct async_poll *apoll = pt->req->apoll; __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } /* * We can't reliably detect loops in repeated poll triggers and issue * subsequently failing. But rather than fail these immediately, allow a * certain amount of retries before we give up. Given that this condition * should _rarely_ trigger even once, we should be fine with a larger value. */ #define APOLL_MAX_RETRY 128 static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); } else { if (!(issue_flags & IO_URING_F_UNLOCKED)) apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC); else apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (!apoll) return NULL; apoll->poll.retries = APOLL_MAX_RETRY; } apoll->double_poll = NULL; req->apoll = apoll; if (unlikely(!--apoll->poll.retries)) return NULL; return apoll; } int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask) { struct async_poll *apoll; struct io_poll_table ipt; int ret; mask |= EPOLLET; if (!io_file_can_poll(req)) return IO_APOLL_ABORTED; if (!(req->flags & REQ_F_APOLL_MULTISHOT)) mask |= EPOLLONESHOT; apoll = io_req_alloc_apoll(req, issue_flags); if (!apoll) return IO_APOLL_ABORTED; req->flags &= ~(REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL); req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); if (ret) return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED; trace_io_uring_poll_arm(req, mask, apoll->poll.events); return IO_APOLL_OK; } int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; __poll_t mask = POLLPRI | POLLERR; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; if (!io_file_can_poll(req)) return IO_APOLL_ABORTED; if (def->pollin) { mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ if (req->flags & REQ_F_CLEAR_POLLIN) mask &= ~EPOLLIN; } else { mask |= EPOLLOUT | EPOLLWRNORM; } if (def->poll_exclusive) mask |= EPOLLEXCLUSIVE; return io_arm_apoll(req, issue_flags, mask); } /* * Returns true if we found and killed one or more poll requests */ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits; struct hlist_node *tmp; struct io_kiocb *req; bool found = false; int i; lockdep_assert_held(&ctx->uring_lock); for (i = 0; i < nr_buckets; i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { if (io_match_task_safe(req, tctx, cancel_all)) { hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } } } return found; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd) { struct io_kiocb *req; u32 index = hash_long(cd->data, ctx->cancel_table.hash_bits); struct io_hash_bucket *hb = &ctx->cancel_table.hbs[index]; hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { if (io_cancel_match_sequence(req, cd->seq)) continue; } return req; } return NULL; } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits; struct io_kiocb *req; int i; for (i = 0; i < nr_buckets; i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; hlist_for_each_entry(req, &hb->list, hash_node) { if (io_cancel_req_match(req, cd)) return req; } } return NULL; } static int io_poll_disarm(struct io_kiocb *req) { if (!req) return -ENOENT; if (!io_poll_get_ownership(req)) return -EALREADY; io_poll_remove_entries(req); hash_del(&req->hash_node); return 0; } static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { struct io_kiocb *req; if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP | IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd); else req = io_poll_find(ctx, false, cd); if (req) { io_poll_cancel_req(req); return 0; } return -ENOENT; } int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags) { int ret; io_ring_submit_lock(ctx, issue_flags); ret = __io_poll_cancel(ctx, cd); io_ring_submit_unlock(ctx, issue_flags); return ret; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, unsigned int flags) { u32 events; events = READ_ONCE(sqe->poll32_events); #ifdef __BIG_ENDIAN events = swahw32(events); #endif if (!(flags & IORING_POLL_ADD_MULTI)) events |= EPOLLONESHOT; if (!(flags & IORING_POLL_ADD_LEVEL)) events |= EPOLLET; return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET)); } int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_update *upd = io_kiocb_to_cmd(req, struct io_poll_update); u32 flags; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | IORING_POLL_ADD_MULTI)) return -EINVAL; /* meaningless without update */ if (flags == IORING_POLL_ADD_MULTI) return -EINVAL; upd->old_user_data = READ_ONCE(sqe->addr); upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; upd->new_user_data = READ_ONCE(sqe->off); if (!upd->update_user_data && upd->new_user_data) return -EINVAL; if (upd->update_events) upd->events = io_poll_parse_events(sqe, flags); else if (sqe->poll32_events) return -EINVAL; return 0; } int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll); u32 flags; if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) return -EINVAL; poll->events = io_poll_parse_events(sqe, flags); return 0; } int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll); struct io_poll_table ipt; int ret; ipt.pt._qproc = io_poll_queue_proc; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); return IOU_COMPLETE; } return ret ?: IOU_ISSUE_SKIP_COMPLETE; } int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); struct io_ring_ctx *ctx = req->ctx; struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, }; struct io_kiocb *preq; int ret2, ret = 0; io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd); ret2 = io_poll_disarm(preq); if (ret2) { ret = ret2; goto out; } if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) { ret = -EFAULT; goto out; } if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ if (poll_update->update_events) { struct io_poll *poll = io_kiocb_to_cmd(preq, struct io_poll); poll->events &= ~0xffff; poll->events |= poll_update->events & 0xffff; poll->events |= IO_POLL_UNMASK; } if (poll_update->update_user_data) preq->cqe.user_data = poll_update->new_user_data; ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED); /* successfully updated, don't complete poll request */ if (!ret2 || ret2 == -EIOCBQUEUED) goto out; } req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); preq->io_task_work.func = io_req_task_complete; io_req_task_work_add(preq); out: io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) { req_set_fail(req); return ret; } /* complete update request, we're done with it */ io_req_set_res(req, ret, 0); return IOU_COMPLETE; }
19 19 19 19 18 19 19 19 19 19 19 19 19 19 19 19 19 18 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 18 19 19 19 19 19 19 19 19 1 19 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 17 18 17 18 18 18 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 // SPDX-License-Identifier: GPL-2.0-only #include <linux/dim.h> #include "netlink.h" #include "common.h" struct coalesce_req_info { struct ethnl_req_info base; }; struct coalesce_reply_data { struct ethnl_reply_data base; struct ethtool_coalesce coalesce; struct kernel_ethtool_coalesce kernel_coalesce; u32 supported_params; }; #define COALESCE_REPDATA(__reply_base) \ container_of(__reply_base, struct coalesce_reply_data, base) #define __SUPPORTED_OFFSET ETHTOOL_A_COALESCE_RX_USECS static u32 attr_to_mask(unsigned int attr_type) { return BIT(attr_type - __SUPPORTED_OFFSET); } /* build time check that indices in ethtool_ops::supported_coalesce_params * match corresponding attribute types with an offset */ #define __CHECK_SUPPORTED_OFFSET(x) \ static_assert((ETHTOOL_ ## x) == \ BIT((ETHTOOL_A_ ## x) - __SUPPORTED_OFFSET)) __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_STATS_BLOCK_USECS); __CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_RX); __CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_TX); __CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_RATE_SAMPLE_INTERVAL); const struct nla_policy ethnl_coalesce_get_policy[] = { [ETHTOOL_A_COALESCE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int coalesce_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; data->supported_params = dev->ethtool_ops->supported_coalesce_params; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce, &data->kernel_coalesce, info->extack); ethnl_ops_complete(dev); return ret; } static int coalesce_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int modersz = nla_total_size(0) + /* _PROFILE_IRQ_MODERATION, nest */ nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_USEC */ nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_PKTS */ nla_total_size(sizeof(u32)); /* _IRQ_MODERATION_COMPS */ int total_modersz = nla_total_size(0) + /* _{R,T}X_PROFILE, nest */ modersz * NET_DIM_PARAMS_NUM_PROFILES; return nla_total_size(sizeof(u32)) + /* _RX_USECS */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _RX_USECS_IRQ */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_IRQ */ nla_total_size(sizeof(u32)) + /* _TX_USECS */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _TX_USECS_IRQ */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_IRQ */ nla_total_size(sizeof(u32)) + /* _STATS_BLOCK_USECS */ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_RX */ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_TX */ nla_total_size(sizeof(u32)) + /* _PKT_RATE_LOW */ nla_total_size(sizeof(u32)) + /* _RX_USECS_LOW */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_LOW */ nla_total_size(sizeof(u32)) + /* _TX_USECS_LOW */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_LOW */ nla_total_size(sizeof(u32)) + /* _PKT_RATE_HIGH */ nla_total_size(sizeof(u32)) + /* _RX_USECS_HIGH */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_RX */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_TIME_USECS */ total_modersz * 2; /* _{R,T}X_PROFILE */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, u32 supported_params) { if (!val && !(supported_params & attr_to_mask(attr_type))) return false; return nla_put_u32(skb, attr_type, val); } static bool coalesce_put_bool(struct sk_buff *skb, u16 attr_type, u32 val, u32 supported_params) { if (!val && !(supported_params & attr_to_mask(attr_type))) return false; return nla_put_u8(skb, attr_type, !!val); } /** * coalesce_put_profile - fill reply with a nla nest with four child nla nests. * @skb: socket buffer the message is stored in * @attr_type: nest attr type ETHTOOL_A_COALESCE_*X_PROFILE * @profile: data passed to userspace * @coal_flags: modifiable parameters supported by the driver * * Put a dim profile nest attribute. Refer to ETHTOOL_A_PROFILE_IRQ_MODERATION. * * Return: 0 on success or a negative error code. */ static int coalesce_put_profile(struct sk_buff *skb, u16 attr_type, const struct dim_cq_moder *profile, u8 coal_flags) { struct nlattr *profile_attr, *moder_attr; int i, ret; if (!profile || !coal_flags) return 0; profile_attr = nla_nest_start(skb, attr_type); if (!profile_attr) return -EMSGSIZE; for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) { moder_attr = nla_nest_start(skb, ETHTOOL_A_PROFILE_IRQ_MODERATION); if (!moder_attr) { ret = -EMSGSIZE; goto cancel_profile; } if (coal_flags & DIM_COALESCE_USEC) { ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_USEC, profile[i].usec); if (ret) goto cancel_moder; } if (coal_flags & DIM_COALESCE_PKTS) { ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_PKTS, profile[i].pkts); if (ret) goto cancel_moder; } if (coal_flags & DIM_COALESCE_COMPS) { ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_COMPS, profile[i].comps); if (ret) goto cancel_moder; } nla_nest_end(skb, moder_attr); } nla_nest_end(skb, profile_attr); return 0; cancel_moder: nla_nest_cancel(skb, moder_attr); cancel_profile: nla_nest_cancel(skb, profile_attr); return ret; } static int coalesce_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); const struct kernel_ethtool_coalesce *kcoal = &data->kernel_coalesce; const struct ethtool_coalesce *coal = &data->coalesce; u32 supported = data->supported_params; struct dim_irq_moder *moder; int ret = 0; if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS, coal->rx_coalesce_usecs, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES, coal->rx_max_coalesced_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_IRQ, coal->rx_coalesce_usecs_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, coal->rx_max_coalesced_frames_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS, coal->tx_coalesce_usecs, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES, coal->tx_max_coalesced_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_IRQ, coal->tx_coalesce_usecs_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, coal->tx_max_coalesced_frames_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, coal->stats_block_coalesce_usecs, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, coal->use_adaptive_rx_coalesce, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, coal->use_adaptive_tx_coalesce, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_LOW, coal->pkt_rate_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_LOW, coal->rx_coalesce_usecs_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, coal->rx_max_coalesced_frames_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_LOW, coal->tx_coalesce_usecs_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, coal->tx_max_coalesced_frames_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_HIGH, coal->pkt_rate_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_HIGH, coal->rx_coalesce_usecs_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, coal->rx_max_coalesced_frames_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_HIGH, coal->tx_coalesce_usecs_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, coal->tx_max_coalesced_frames_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, coal->rate_sample_interval, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, kcoal->use_cqe_mode_tx, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, kcoal->use_cqe_mode_rx, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, kcoal->tx_aggr_max_bytes, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, kcoal->tx_aggr_max_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, kcoal->tx_aggr_time_usecs, supported)) return -EMSGSIZE; if (!req_base->dev || !req_base->dev->irq_moder) return 0; moder = req_base->dev->irq_moder; rcu_read_lock(); if (moder->profile_flags & DIM_PROFILE_RX) { ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_RX_PROFILE, rcu_dereference(moder->rx_profile), moder->coal_flags); if (ret) goto out; } if (moder->profile_flags & DIM_PROFILE_TX) ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_TX_PROFILE, rcu_dereference(moder->tx_profile), moder->coal_flags); out: rcu_read_unlock(); return ret; } /* COALESCE_SET */ static const struct nla_policy coalesce_irq_moderation_policy[] = { [ETHTOOL_A_IRQ_MODERATION_USEC] = { .type = NLA_U32 }, [ETHTOOL_A_IRQ_MODERATION_PKTS] = { .type = NLA_U32 }, [ETHTOOL_A_IRQ_MODERATION_COMPS] = { .type = NLA_U32 }, }; static const struct nla_policy coalesce_profile_policy[] = { [ETHTOOL_A_PROFILE_IRQ_MODERATION] = NLA_POLICY_NESTED(coalesce_irq_moderation_policy), }; const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_U8 }, [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_PROFILE] = NLA_POLICY_NESTED(coalesce_profile_policy), [ETHTOOL_A_COALESCE_TX_PROFILE] = NLA_POLICY_NESTED(coalesce_profile_policy), }; static int ethnl_set_coalesce_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct dim_irq_moder *irq_moder = req_info->dev->irq_moder; struct nlattr **tb = info->attrs; u32 supported_params; u16 a; if (!ops->get_coalesce || !ops->set_coalesce) return -EOPNOTSUPP; /* make sure that only supported parameters are present */ supported_params = ops->supported_coalesce_params; if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_RX) supported_params |= ETHTOOL_COALESCE_RX_PROFILE; if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_TX) supported_params |= ETHTOOL_COALESCE_TX_PROFILE; for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++) if (tb[a] && !(supported_params & attr_to_mask(a))) { NL_SET_ERR_MSG_ATTR(info->extack, tb[a], "cannot modify an unsupported parameter"); return -EINVAL; } return 1; } /** * ethnl_update_irq_moder - update a specific field in the given profile * @irq_moder: place that collects dim related information * @irq_field: field in profile to modify * @attr_type: attr type ETHTOOL_A_IRQ_MODERATION_* * @tb: netlink attribute with new values or null * @coal_bit: DIM_COALESCE_* bit from coal_flags * @mod: pointer to bool for modification tracking * @extack: netlink extended ack * * Return: 0 on success or a negative error code. */ static int ethnl_update_irq_moder(struct dim_irq_moder *irq_moder, u16 *irq_field, u16 attr_type, struct nlattr **tb, u8 coal_bit, bool *mod, struct netlink_ext_ack *extack) { int ret = 0; u32 val; if (!tb[attr_type]) return 0; if (irq_moder->coal_flags & coal_bit) { val = nla_get_u32(tb[attr_type]); if (*irq_field == val) return 0; *irq_field = val; *mod = true; } else { NL_SET_BAD_ATTR(extack, tb[attr_type]); ret = -EOPNOTSUPP; } return ret; } /** * ethnl_update_profile - get a profile nest with child nests from userspace. * @dev: netdevice to update the profile * @dst: profile get from the driver and modified by ethnl_update_profile. * @nests: nest attr ETHTOOL_A_COALESCE_*X_PROFILE to set profile. * @mod: pointer to bool for modification tracking * @extack: Netlink extended ack * * Layout of nests: * Nested ETHTOOL_A_COALESCE_*X_PROFILE attr * Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr * ETHTOOL_A_IRQ_MODERATION_USEC attr * ETHTOOL_A_IRQ_MODERATION_PKTS attr * ETHTOOL_A_IRQ_MODERATION_COMPS attr * ... * Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr * ETHTOOL_A_IRQ_MODERATION_USEC attr * ETHTOOL_A_IRQ_MODERATION_PKTS attr * ETHTOOL_A_IRQ_MODERATION_COMPS attr * * Return: 0 on success or a negative error code. */ static int ethnl_update_profile(struct net_device *dev, struct dim_cq_moder __rcu **dst, const struct nlattr *nests, bool *mod, struct netlink_ext_ack *extack) { int len_irq_moder = ARRAY_SIZE(coalesce_irq_moderation_policy); struct nlattr *tb[ARRAY_SIZE(coalesce_irq_moderation_policy)]; struct dim_irq_moder *irq_moder = dev->irq_moder; struct dim_cq_moder *new_profile, *old_profile; int ret, rem, i = 0, len; struct nlattr *nest; if (!nests) return 0; if (!*dst) return -EOPNOTSUPP; old_profile = rtnl_dereference(*dst); len = NET_DIM_PARAMS_NUM_PROFILES * sizeof(*old_profile); new_profile = kmemdup(old_profile, len, GFP_KERNEL); if (!new_profile) return -ENOMEM; nla_for_each_nested_type(nest, ETHTOOL_A_PROFILE_IRQ_MODERATION, nests, rem) { ret = nla_parse_nested(tb, len_irq_moder - 1, nest, coalesce_irq_moderation_policy, extack); if (ret) goto err_out; ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].usec, ETHTOOL_A_IRQ_MODERATION_USEC, tb, DIM_COALESCE_USEC, mod, extack); if (ret) goto err_out; ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].pkts, ETHTOOL_A_IRQ_MODERATION_PKTS, tb, DIM_COALESCE_PKTS, mod, extack); if (ret) goto err_out; ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].comps, ETHTOOL_A_IRQ_MODERATION_COMPS, tb, DIM_COALESCE_COMPS, mod, extack); if (ret) goto err_out; i++; } /* After the profile is modified, dim itself is a dynamic * mechanism and will quickly fit to the appropriate * coalescing parameters according to the new profile. */ rcu_assign_pointer(*dst, new_profile); kfree_rcu(old_profile, rcu); return 0; err_out: kfree(new_profile); return ret; } static int __ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info, bool *dual_change) { struct kernel_ethtool_coalesce kernel_coalesce = {}; struct net_device *dev = req_info->dev; struct ethtool_coalesce coalesce = {}; bool mod_mode = false, mod = false; struct nlattr **tb = info->attrs; int ret; ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, info->extack); if (ret < 0) return ret; /* Update values */ ethnl_update_u32(&coalesce.rx_coalesce_usecs, tb[ETHTOOL_A_COALESCE_RX_USECS], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES], &mod); ethnl_update_u32(&coalesce.rx_coalesce_usecs_irq, tb[ETHTOOL_A_COALESCE_RX_USECS_IRQ], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames_irq, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs, tb[ETHTOOL_A_COALESCE_TX_USECS], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs_irq, tb[ETHTOOL_A_COALESCE_TX_USECS_IRQ], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames_irq, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ], &mod); ethnl_update_u32(&coalesce.stats_block_coalesce_usecs, tb[ETHTOOL_A_COALESCE_STATS_BLOCK_USECS], &mod); ethnl_update_u32(&coalesce.pkt_rate_low, tb[ETHTOOL_A_COALESCE_PKT_RATE_LOW], &mod); ethnl_update_u32(&coalesce.rx_coalesce_usecs_low, tb[ETHTOOL_A_COALESCE_RX_USECS_LOW], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames_low, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs_low, tb[ETHTOOL_A_COALESCE_TX_USECS_LOW], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames_low, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW], &mod); ethnl_update_u32(&coalesce.pkt_rate_high, tb[ETHTOOL_A_COALESCE_PKT_RATE_HIGH], &mod); ethnl_update_u32(&coalesce.rx_coalesce_usecs_high, tb[ETHTOOL_A_COALESCE_RX_USECS_HIGH], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames_high, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs_high, tb[ETHTOOL_A_COALESCE_TX_USECS_HIGH], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames_high, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod); ethnl_update_u32(&coalesce.rate_sample_interval, tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_max_bytes, tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_max_frames, tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs, tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod); if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_RX) { ret = ethnl_update_profile(dev, &dev->irq_moder->rx_profile, tb[ETHTOOL_A_COALESCE_RX_PROFILE], &mod, info->extack); if (ret < 0) return ret; } if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_TX) { ret = ethnl_update_profile(dev, &dev->irq_moder->tx_profile, tb[ETHTOOL_A_COALESCE_TX_PROFILE], &mod, info->extack); if (ret < 0) return ret; } /* Update operation modes */ ethnl_update_bool32(&coalesce.use_adaptive_rx_coalesce, tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX], &mod_mode); ethnl_update_bool32(&coalesce.use_adaptive_tx_coalesce, tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX], &mod_mode); ethnl_update_u8(&kernel_coalesce.use_cqe_mode_tx, tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod_mode); ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx, tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod_mode); *dual_change = mod && mod_mode; if (!mod && !mod_mode) return 0; ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, info->extack); return ret < 0 ? ret : 1; } static int ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info) { bool dual_change; int err, ret; /* SET_COALESCE may change operation mode and parameters in one call. * Changing operation mode may cause the driver to reset the parameter * values, and therefore ignore user input (driver does not know which * parameters come from user and which are echoed back from ->get). * To not complicate the drivers if user tries to change both the mode * and parameters at once - call the driver twice. */ err = __ethnl_set_coalesce(req_info, info, &dual_change); if (err < 0) return err; ret = err; if (ret && dual_change) { err = __ethnl_set_coalesce(req_info, info, &dual_change); if (err < 0) return err; } return ret; } const struct ethnl_request_ops ethnl_coalesce_request_ops = { .request_cmd = ETHTOOL_MSG_COALESCE_GET, .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY, .hdr_attr = ETHTOOL_A_COALESCE_HEADER, .req_info_size = sizeof(struct coalesce_req_info), .reply_data_size = sizeof(struct coalesce_reply_data), .prepare_data = coalesce_prepare_data, .reply_size = coalesce_reply_size, .fill_reply = coalesce_fill_reply, .set_validate = ethnl_set_coalesce_validate, .set = ethnl_set_coalesce, .set_ntf_cmd = ETHTOOL_MSG_COALESCE_NTF, };
3 3 1 1 3 3 3 3 3 3 3 3 1 1 1 1 25 26 27 12 12 12 3 3 9 9 4 4 4 4 5 5 5 5 6 6 6 3 3 3 3 3 9 4 5 3 3 10 3 4 5 1 3 3 3 1 30 15 67 1 1 1 1 67 1 1 1 1 9 9 1 1 1 1 1 1 1 1 4 1 3 5 1 1 1 1 1 1 1 1 9 9 9 9 9 1 1 1 1 1 2 1 2 2 2 2 2 4 4 4 4 5 5 5 5 1 1 1 1 2 2 2 2 2 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 68 67 68 30 30 29 39 67 68 68 1 1 1 1 2 1 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 1 1 19 19 19 19 19 7 6 7 7 7 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 // SPDX-License-Identifier: GPL-2.0-only /* * V4L2 sub-device * * Copyright (C) 2010 Nokia Corporation * * Contact: Laurent Pinchart <laurent.pinchart@ideasonboard.com> * Sakari Ailus <sakari.ailus@iki.fi> */ #include <linux/export.h> #include <linux/ioctl.h> #include <linux/leds.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/overflow.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> #include <linux/version.h> #include <linux/videodev2.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-device.h> #include <media/v4l2-event.h> #include <media/v4l2-fh.h> #include <media/v4l2-ioctl.h> /** * struct v4l2_subdev_stream_config - Used for storing stream configuration. * * @pad: pad number * @stream: stream number * @enabled: has the stream been enabled with v4l2_subdev_enable_streams() * @fmt: &struct v4l2_mbus_framefmt * @crop: &struct v4l2_rect to be used for crop * @compose: &struct v4l2_rect to be used for compose * @interval: frame interval * * This structure stores configuration for a stream. */ struct v4l2_subdev_stream_config { u32 pad; u32 stream; bool enabled; struct v4l2_mbus_framefmt fmt; struct v4l2_rect crop; struct v4l2_rect compose; struct v4l2_fract interval; }; #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) /* * The Streams API is an experimental feature. To use the Streams API, set * 'v4l2_subdev_enable_streams_api' to 1 below. */ static bool v4l2_subdev_enable_streams_api; #endif /* * Maximum stream ID is 63 for now, as we use u64 bitmask to represent a set * of streams. * * Note that V4L2_FRAME_DESC_ENTRY_MAX is related: V4L2_FRAME_DESC_ENTRY_MAX * restricts the total number of streams in a pad, although the stream ID is * not restricted. */ #define V4L2_SUBDEV_MAX_STREAM_ID 63 #include "v4l2-subdev-priv.h" #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) static int subdev_fh_init(struct v4l2_subdev_fh *fh, struct v4l2_subdev *sd) { struct v4l2_subdev_state *state; static struct lock_class_key key; state = __v4l2_subdev_state_alloc(sd, "fh->state->lock", &key); if (IS_ERR(state)) return PTR_ERR(state); fh->state = state; return 0; } static void subdev_fh_free(struct v4l2_subdev_fh *fh) { __v4l2_subdev_state_free(fh->state); fh->state = NULL; } static int subdev_open(struct file *file) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); struct v4l2_subdev_fh *subdev_fh; int ret; subdev_fh = kzalloc(sizeof(*subdev_fh), GFP_KERNEL); if (subdev_fh == NULL) return -ENOMEM; ret = subdev_fh_init(subdev_fh, sd); if (ret) { kfree(subdev_fh); return ret; } v4l2_fh_init(&subdev_fh->vfh, vdev); v4l2_fh_add(&subdev_fh->vfh, file); if (sd->v4l2_dev->mdev && sd->entity.graph_obj.mdev->dev) { struct module *owner; owner = sd->entity.graph_obj.mdev->dev->driver->owner; if (!try_module_get(owner)) { ret = -EBUSY; goto err; } subdev_fh->owner = owner; } if (sd->internal_ops && sd->internal_ops->open) { ret = sd->internal_ops->open(sd, subdev_fh); if (ret < 0) goto err; } return 0; err: module_put(subdev_fh->owner); v4l2_fh_del(&subdev_fh->vfh, file); v4l2_fh_exit(&subdev_fh->vfh); subdev_fh_free(subdev_fh); kfree(subdev_fh); return ret; } static int subdev_close(struct file *file) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct v4l2_subdev_fh *subdev_fh = to_v4l2_subdev_fh(vfh); if (sd->internal_ops && sd->internal_ops->close) sd->internal_ops->close(sd, subdev_fh); module_put(subdev_fh->owner); v4l2_fh_del(vfh, file); v4l2_fh_exit(vfh); subdev_fh_free(subdev_fh); kfree(subdev_fh); return 0; } #else /* CONFIG_VIDEO_V4L2_SUBDEV_API */ static int subdev_open(struct file *file) { return -ENODEV; } static int subdev_close(struct file *file) { return -ENODEV; } #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ static void v4l2_subdev_enable_privacy_led(struct v4l2_subdev *sd) { #if IS_REACHABLE(CONFIG_LEDS_CLASS) if (!IS_ERR_OR_NULL(sd->privacy_led)) led_set_brightness(sd->privacy_led, sd->privacy_led->max_brightness); #endif } static void v4l2_subdev_disable_privacy_led(struct v4l2_subdev *sd) { #if IS_REACHABLE(CONFIG_LEDS_CLASS) if (!IS_ERR_OR_NULL(sd->privacy_led)) led_set_brightness(sd->privacy_led, 0); #endif } static inline int check_which(u32 which) { if (which != V4L2_SUBDEV_FORMAT_TRY && which != V4L2_SUBDEV_FORMAT_ACTIVE) return -EINVAL; return 0; } static inline int check_pad(struct v4l2_subdev *sd, u32 pad) { #if defined(CONFIG_MEDIA_CONTROLLER) if (sd->entity.num_pads) { if (pad >= sd->entity.num_pads) return -EINVAL; return 0; } #endif /* allow pad 0 on subdevices not registered as media entities */ if (pad > 0) return -EINVAL; return 0; } static int check_state(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, u32 which, u32 pad, u32 stream) { if (sd->flags & V4L2_SUBDEV_FL_STREAMS) { #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) if (!v4l2_subdev_state_get_format(state, pad, stream)) return -EINVAL; return 0; #else return -EINVAL; #endif } if (stream != 0) return -EINVAL; if (which == V4L2_SUBDEV_FORMAT_TRY && (!state || !state->pads)) return -EINVAL; return 0; } static inline int check_format(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_format *format) { if (!format) return -EINVAL; return check_which(format->which) ? : check_pad(sd, format->pad) ? : check_state(sd, state, format->which, format->pad, format->stream); } static int call_get_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_format *format) { return check_format(sd, state, format) ? : sd->ops->pad->get_fmt(sd, state, format); } static int call_set_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_format *format) { return check_format(sd, state, format) ? : sd->ops->pad->set_fmt(sd, state, format); } static int call_enum_mbus_code(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_mbus_code_enum *code) { if (!code) return -EINVAL; return check_which(code->which) ? : check_pad(sd, code->pad) ? : check_state(sd, state, code->which, code->pad, code->stream) ? : sd->ops->pad->enum_mbus_code(sd, state, code); } static int call_enum_frame_size(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_size_enum *fse) { if (!fse) return -EINVAL; return check_which(fse->which) ? : check_pad(sd, fse->pad) ? : check_state(sd, state, fse->which, fse->pad, fse->stream) ? : sd->ops->pad->enum_frame_size(sd, state, fse); } static int call_enum_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval_enum *fie) { if (!fie) return -EINVAL; return check_which(fie->which) ? : check_pad(sd, fie->pad) ? : check_state(sd, state, fie->which, fie->pad, fie->stream) ? : sd->ops->pad->enum_frame_interval(sd, state, fie); } static inline int check_selection(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_selection *sel) { if (!sel) return -EINVAL; return check_which(sel->which) ? : check_pad(sd, sel->pad) ? : check_state(sd, state, sel->which, sel->pad, sel->stream); } static int call_get_selection(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_selection *sel) { return check_selection(sd, state, sel) ? : sd->ops->pad->get_selection(sd, state, sel); } static int call_set_selection(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_selection *sel) { return check_selection(sd, state, sel) ? : sd->ops->pad->set_selection(sd, state, sel); } static inline int check_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *fi) { if (!fi) return -EINVAL; return check_which(fi->which) ? : check_pad(sd, fi->pad) ? : check_state(sd, state, fi->which, fi->pad, fi->stream); } static int call_get_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *fi) { return check_frame_interval(sd, state, fi) ? : sd->ops->pad->get_frame_interval(sd, state, fi); } static int call_set_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *fi) { return check_frame_interval(sd, state, fi) ? : sd->ops->pad->set_frame_interval(sd, state, fi); } static int call_get_frame_desc(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_mbus_frame_desc *fd) { unsigned int i; int ret; #if defined(CONFIG_MEDIA_CONTROLLER) if (!(sd->entity.pads[pad].flags & MEDIA_PAD_FL_SOURCE)) return -EOPNOTSUPP; #endif memset(fd, 0, sizeof(*fd)); ret = sd->ops->pad->get_frame_desc(sd, pad, fd); if (ret) return ret; dev_dbg(sd->dev, "Frame descriptor on pad %u, type %s\n", pad, fd->type == V4L2_MBUS_FRAME_DESC_TYPE_PARALLEL ? "parallel" : fd->type == V4L2_MBUS_FRAME_DESC_TYPE_CSI2 ? "CSI-2" : "unknown"); for (i = 0; i < fd->num_entries; i++) { struct v4l2_mbus_frame_desc_entry *entry = &fd->entry[i]; char buf[20] = ""; if (fd->type == V4L2_MBUS_FRAME_DESC_TYPE_CSI2) WARN_ON(snprintf(buf, sizeof(buf), ", vc %u, dt 0x%02x", entry->bus.csi2.vc, entry->bus.csi2.dt) >= sizeof(buf)); dev_dbg(sd->dev, "\tstream %u, code 0x%04x, length %u, flags 0x%04x%s\n", entry->stream, entry->pixelcode, entry->length, entry->flags, buf); } return 0; } static inline int check_edid(struct v4l2_subdev *sd, struct v4l2_subdev_edid *edid) { if (!edid) return -EINVAL; if (edid->blocks && edid->edid == NULL) return -EINVAL; return check_pad(sd, edid->pad); } static int call_get_edid(struct v4l2_subdev *sd, struct v4l2_subdev_edid *edid) { return check_edid(sd, edid) ? : sd->ops->pad->get_edid(sd, edid); } static int call_set_edid(struct v4l2_subdev *sd, struct v4l2_subdev_edid *edid) { return check_edid(sd, edid) ? : sd->ops->pad->set_edid(sd, edid); } static int call_s_dv_timings(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_dv_timings *timings) { if (!timings) return -EINVAL; return check_pad(sd, pad) ? : sd->ops->pad->s_dv_timings(sd, pad, timings); } static int call_g_dv_timings(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_dv_timings *timings) { if (!timings) return -EINVAL; return check_pad(sd, pad) ? : sd->ops->pad->g_dv_timings(sd, pad, timings); } static int call_query_dv_timings(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_dv_timings *timings) { if (!timings) return -EINVAL; return check_pad(sd, pad) ? : sd->ops->pad->query_dv_timings(sd, pad, timings); } static int call_dv_timings_cap(struct v4l2_subdev *sd, struct v4l2_dv_timings_cap *cap) { if (!cap) return -EINVAL; return check_pad(sd, cap->pad) ? : sd->ops->pad->dv_timings_cap(sd, cap); } static int call_enum_dv_timings(struct v4l2_subdev *sd, struct v4l2_enum_dv_timings *dvt) { if (!dvt) return -EINVAL; return check_pad(sd, dvt->pad) ? : sd->ops->pad->enum_dv_timings(sd, dvt); } static int call_get_mbus_config(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_mbus_config *config) { memset(config, 0, sizeof(*config)); return check_pad(sd, pad) ? : sd->ops->pad->get_mbus_config(sd, pad, config); } static int call_s_stream(struct v4l2_subdev *sd, int enable) { int ret; /* * The .s_stream() operation must never be called to start or stop an * already started or stopped subdev. Catch offenders but don't return * an error yet to avoid regressions. */ if (WARN_ON(sd->s_stream_enabled == !!enable)) return 0; ret = sd->ops->video->s_stream(sd, enable); if (!enable && ret < 0) { dev_warn(sd->dev, "disabling streaming failed (%d)\n", ret); ret = 0; } if (!ret) { sd->s_stream_enabled = enable; if (enable) v4l2_subdev_enable_privacy_led(sd); else v4l2_subdev_disable_privacy_led(sd); } return ret; } #ifdef CONFIG_MEDIA_CONTROLLER /* * Create state-management wrapper for pad ops dealing with subdev state. The * wrapper handles the case where the caller does not provide the called * subdev's state. This should be removed when all the callers are fixed. */ #define DEFINE_STATE_WRAPPER(f, arg_type) \ static int call_##f##_state(struct v4l2_subdev *sd, \ struct v4l2_subdev_state *_state, \ arg_type *arg) \ { \ struct v4l2_subdev_state *state = _state; \ int ret; \ if (!_state) \ state = v4l2_subdev_lock_and_get_active_state(sd); \ ret = call_##f(sd, state, arg); \ if (!_state && state) \ v4l2_subdev_unlock_state(state); \ return ret; \ } #else /* CONFIG_MEDIA_CONTROLLER */ #define DEFINE_STATE_WRAPPER(f, arg_type) \ static int call_##f##_state(struct v4l2_subdev *sd, \ struct v4l2_subdev_state *state, \ arg_type *arg) \ { \ return call_##f(sd, state, arg); \ } #endif /* CONFIG_MEDIA_CONTROLLER */ DEFINE_STATE_WRAPPER(get_fmt, struct v4l2_subdev_format); DEFINE_STATE_WRAPPER(set_fmt, struct v4l2_subdev_format); DEFINE_STATE_WRAPPER(enum_mbus_code, struct v4l2_subdev_mbus_code_enum); DEFINE_STATE_WRAPPER(enum_frame_size, struct v4l2_subdev_frame_size_enum); DEFINE_STATE_WRAPPER(enum_frame_interval, struct v4l2_subdev_frame_interval_enum); DEFINE_STATE_WRAPPER(get_selection, struct v4l2_subdev_selection); DEFINE_STATE_WRAPPER(set_selection, struct v4l2_subdev_selection); static const struct v4l2_subdev_pad_ops v4l2_subdev_call_pad_wrappers = { .get_fmt = call_get_fmt_state, .set_fmt = call_set_fmt_state, .enum_mbus_code = call_enum_mbus_code_state, .enum_frame_size = call_enum_frame_size_state, .enum_frame_interval = call_enum_frame_interval_state, .get_selection = call_get_selection_state, .set_selection = call_set_selection_state, .get_frame_interval = call_get_frame_interval, .set_frame_interval = call_set_frame_interval, .get_edid = call_get_edid, .set_edid = call_set_edid, .s_dv_timings = call_s_dv_timings, .g_dv_timings = call_g_dv_timings, .query_dv_timings = call_query_dv_timings, .dv_timings_cap = call_dv_timings_cap, .enum_dv_timings = call_enum_dv_timings, .get_frame_desc = call_get_frame_desc, .get_mbus_config = call_get_mbus_config, }; static const struct v4l2_subdev_video_ops v4l2_subdev_call_video_wrappers = { .s_stream = call_s_stream, }; const struct v4l2_subdev_ops v4l2_subdev_call_wrappers = { .pad = &v4l2_subdev_call_pad_wrappers, .video = &v4l2_subdev_call_video_wrappers, }; EXPORT_SYMBOL(v4l2_subdev_call_wrappers); #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) static struct v4l2_subdev_state * subdev_ioctl_get_state(struct v4l2_subdev *sd, struct v4l2_subdev_fh *subdev_fh, unsigned int cmd, void *arg) { u32 which; switch (cmd) { default: return NULL; case VIDIOC_SUBDEV_G_FMT: case VIDIOC_SUBDEV_S_FMT: which = ((struct v4l2_subdev_format *)arg)->which; break; case VIDIOC_SUBDEV_G_CROP: case VIDIOC_SUBDEV_S_CROP: which = ((struct v4l2_subdev_crop *)arg)->which; break; case VIDIOC_SUBDEV_ENUM_MBUS_CODE: which = ((struct v4l2_subdev_mbus_code_enum *)arg)->which; break; case VIDIOC_SUBDEV_ENUM_FRAME_SIZE: which = ((struct v4l2_subdev_frame_size_enum *)arg)->which; break; case VIDIOC_SUBDEV_ENUM_FRAME_INTERVAL: which = ((struct v4l2_subdev_frame_interval_enum *)arg)->which; break; case VIDIOC_SUBDEV_G_SELECTION: case VIDIOC_SUBDEV_S_SELECTION: which = ((struct v4l2_subdev_selection *)arg)->which; break; case VIDIOC_SUBDEV_G_FRAME_INTERVAL: case VIDIOC_SUBDEV_S_FRAME_INTERVAL: { struct v4l2_subdev_frame_interval *fi = arg; if (!(subdev_fh->client_caps & V4L2_SUBDEV_CLIENT_CAP_INTERVAL_USES_WHICH)) fi->which = V4L2_SUBDEV_FORMAT_ACTIVE; which = fi->which; break; } case VIDIOC_SUBDEV_G_ROUTING: case VIDIOC_SUBDEV_S_ROUTING: which = ((struct v4l2_subdev_routing *)arg)->which; break; } return which == V4L2_SUBDEV_FORMAT_TRY ? subdev_fh->state : v4l2_subdev_get_unlocked_active_state(sd); } static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg, struct v4l2_subdev_state *state) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct v4l2_subdev_fh *subdev_fh = to_v4l2_subdev_fh(vfh); bool ro_subdev = test_bit(V4L2_FL_SUBDEV_RO_DEVNODE, &vdev->flags); bool streams_subdev = sd->flags & V4L2_SUBDEV_FL_STREAMS; bool client_supports_streams = subdev_fh->client_caps & V4L2_SUBDEV_CLIENT_CAP_STREAMS; int rval; /* * If the streams API is not enabled, remove V4L2_SUBDEV_CAP_STREAMS. * Remove this when the API is no longer experimental. */ if (!v4l2_subdev_enable_streams_api) streams_subdev = false; switch (cmd) { case VIDIOC_SUBDEV_QUERYCAP: { struct v4l2_subdev_capability *cap = arg; memset(cap->reserved, 0, sizeof(cap->reserved)); cap->version = LINUX_VERSION_CODE; cap->capabilities = (ro_subdev ? V4L2_SUBDEV_CAP_RO_SUBDEV : 0) | (streams_subdev ? V4L2_SUBDEV_CAP_STREAMS : 0); return 0; } case VIDIOC_QUERYCTRL: /* * TODO: this really should be folded into v4l2_queryctrl (this * currently returns -EINVAL for NULL control handlers). * However, v4l2_queryctrl() is still called directly by * drivers as well and until that has been addressed I believe * it is safer to do the check here. The same is true for the * other control ioctls below. */ if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_queryctrl(vfh->ctrl_handler, arg); case VIDIOC_QUERY_EXT_CTRL: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_query_ext_ctrl(vfh->ctrl_handler, arg); case VIDIOC_QUERYMENU: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_querymenu(vfh->ctrl_handler, arg); case VIDIOC_G_CTRL: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_g_ctrl(vfh->ctrl_handler, arg); case VIDIOC_S_CTRL: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_s_ctrl(vfh, vfh->ctrl_handler, arg); case VIDIOC_G_EXT_CTRLS: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_g_ext_ctrls(vfh->ctrl_handler, vdev, sd->v4l2_dev->mdev, arg); case VIDIOC_S_EXT_CTRLS: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_s_ext_ctrls(vfh, vfh->ctrl_handler, vdev, sd->v4l2_dev->mdev, arg); case VIDIOC_TRY_EXT_CTRLS: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_try_ext_ctrls(vfh->ctrl_handler, vdev, sd->v4l2_dev->mdev, arg); case VIDIOC_DQEVENT: if (!(sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS)) return -ENOIOCTLCMD; return v4l2_event_dequeue(vfh, arg, file->f_flags & O_NONBLOCK); case VIDIOC_SUBSCRIBE_EVENT: if (v4l2_subdev_has_op(sd, core, subscribe_event)) return v4l2_subdev_call(sd, core, subscribe_event, vfh, arg); if ((sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS) && vfh->ctrl_handler) return v4l2_ctrl_subdev_subscribe_event(sd, vfh, arg); return -ENOIOCTLCMD; case VIDIOC_UNSUBSCRIBE_EVENT: if (v4l2_subdev_has_op(sd, core, unsubscribe_event)) return v4l2_subdev_call(sd, core, unsubscribe_event, vfh, arg); if (sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS) return v4l2_event_subdev_unsubscribe(sd, vfh, arg); return -ENOIOCTLCMD; #ifdef CONFIG_VIDEO_ADV_DEBUG case VIDIOC_DBG_G_REGISTER: { struct v4l2_dbg_register *p = arg; if (!capable(CAP_SYS_ADMIN)) return -EPERM; return v4l2_subdev_call(sd, core, g_register, p); } case VIDIOC_DBG_S_REGISTER: { struct v4l2_dbg_register *p = arg; if (!capable(CAP_SYS_ADMIN)) return -EPERM; return v4l2_subdev_call(sd, core, s_register, p); } case VIDIOC_DBG_G_CHIP_INFO: { struct v4l2_dbg_chip_info *p = arg; if (p->match.type != V4L2_CHIP_MATCH_SUBDEV || p->match.addr) return -EINVAL; if (sd->ops->core && sd->ops->core->s_register) p->flags |= V4L2_CHIP_FL_WRITABLE; if (sd->ops->core && sd->ops->core->g_register) p->flags |= V4L2_CHIP_FL_READABLE; strscpy(p->name, sd->name, sizeof(p->name)); return 0; } #endif case VIDIOC_LOG_STATUS: { int ret; pr_info("%s: ================= START STATUS =================\n", sd->name); ret = v4l2_subdev_call(sd, core, log_status); pr_info("%s: ================== END STATUS ==================\n", sd->name); return ret; } case VIDIOC_SUBDEV_G_FMT: { struct v4l2_subdev_format *format = arg; if (!client_supports_streams) format->stream = 0; memset(format->reserved, 0, sizeof(format->reserved)); memset(format->format.reserved, 0, sizeof(format->format.reserved)); return v4l2_subdev_call(sd, pad, get_fmt, state, format); } case VIDIOC_SUBDEV_S_FMT: { struct v4l2_subdev_format *format = arg; if (format->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev) return -EPERM; if (!client_supports_streams) format->stream = 0; memset(format->reserved, 0, sizeof(format->reserved)); memset(format->format.reserved, 0, sizeof(format->format.reserved)); return v4l2_subdev_call(sd, pad, set_fmt, state, format); } case VIDIOC_SUBDEV_G_CROP: { struct v4l2_subdev_crop *crop = arg; struct v4l2_subdev_selection sel; if (!client_supports_streams) crop->stream = 0; memset(crop->reserved, 0, sizeof(crop->reserved)); memset(&sel, 0, sizeof(sel)); sel.which = crop->which; sel.pad = crop->pad; sel.stream = crop->stream; sel.target = V4L2_SEL_TGT_CROP; rval = v4l2_subdev_call( sd, pad, get_selection, state, &sel); crop->rect = sel.r; return rval; } case VIDIOC_SUBDEV_S_CROP: { struct v4l2_subdev_crop *crop = arg; struct v4l2_subdev_selection sel; if (crop->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev) return -EPERM; if (!client_supports_streams) crop->stream = 0; memset(crop->reserved, 0, sizeof(crop->reserved)); memset(&sel, 0, sizeof(sel)); sel.which = crop->which; sel.pad = crop->pad; sel.stream = crop->stream; sel.target = V4L2_SEL_TGT_CROP; sel.r = crop->rect; rval = v4l2_subdev_call( sd, pad, set_selection, state, &sel); crop->rect = sel.r; return rval; } case VIDIOC_SUBDEV_ENUM_MBUS_CODE: { struct v4l2_subdev_mbus_code_enum *code = arg; if (!client_supports_streams) code->stream = 0; memset(code->reserved, 0, sizeof(code->reserved)); return v4l2_subdev_call(sd, pad, enum_mbus_code, state, code); } case VIDIOC_SUBDEV_ENUM_FRAME_SIZE: { struct v4l2_subdev_frame_size_enum *fse = arg; if (!client_supports_streams) fse->stream = 0; memset(fse->reserved, 0, sizeof(fse->reserved)); return v4l2_subdev_call(sd, pad, enum_frame_size, state, fse); } case VIDIOC_SUBDEV_G_FRAME_INTERVAL: { struct v4l2_subdev_frame_interval *fi = arg; if (!client_supports_streams) fi->stream = 0; memset(fi->reserved, 0, sizeof(fi->reserved)); return v4l2_subdev_call(sd, pad, get_frame_interval, state, fi); } case VIDIOC_SUBDEV_S_FRAME_INTERVAL: { struct v4l2_subdev_frame_interval *fi = arg; if (!client_supports_streams) fi->stream = 0; if (fi->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev) return -EPERM; memset(fi->reserved, 0, sizeof(fi->reserved)); return v4l2_subdev_call(sd, pad, set_frame_interval, state, fi); } case VIDIOC_SUBDEV_ENUM_FRAME_INTERVAL: { struct v4l2_subdev_frame_interval_enum *fie = arg; if (!client_supports_streams) fie->stream = 0; memset(fie->reserved, 0, sizeof(fie->reserved)); return v4l2_subdev_call(sd, pad, enum_frame_interval, state, fie); } case VIDIOC_SUBDEV_G_SELECTION: { struct v4l2_subdev_selection *sel = arg; if (!client_supports_streams) sel->stream = 0; memset(sel->reserved, 0, sizeof(sel->reserved)); return v4l2_subdev_call( sd, pad, get_selection, state, sel); } case VIDIOC_SUBDEV_S_SELECTION: { struct v4l2_subdev_selection *sel = arg; if (sel->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev) return -EPERM; if (!client_supports_streams) sel->stream = 0; memset(sel->reserved, 0, sizeof(sel->reserved)); return v4l2_subdev_call( sd, pad, set_selection, state, sel); } case VIDIOC_G_EDID: { struct v4l2_subdev_edid *edid = arg; return v4l2_subdev_call(sd, pad, get_edid, edid); } case VIDIOC_S_EDID: { struct v4l2_subdev_edid *edid = arg; return v4l2_subdev_call(sd, pad, set_edid, edid); } case VIDIOC_SUBDEV_DV_TIMINGS_CAP: { struct v4l2_dv_timings_cap *cap = arg; return v4l2_subdev_call(sd, pad, dv_timings_cap, cap); } case VIDIOC_SUBDEV_ENUM_DV_TIMINGS: { struct v4l2_enum_dv_timings *dvt = arg; return v4l2_subdev_call(sd, pad, enum_dv_timings, dvt); } case VIDIOC_SUBDEV_QUERY_DV_TIMINGS: return v4l2_subdev_call(sd, pad, query_dv_timings, 0, arg); case VIDIOC_SUBDEV_G_DV_TIMINGS: return v4l2_subdev_call(sd, pad, g_dv_timings, 0, arg); case VIDIOC_SUBDEV_S_DV_TIMINGS: if (ro_subdev) return -EPERM; return v4l2_subdev_call(sd, pad, s_dv_timings, 0, arg); case VIDIOC_SUBDEV_G_STD: return v4l2_subdev_call(sd, video, g_std, arg); case VIDIOC_SUBDEV_S_STD: { v4l2_std_id *std = arg; if (ro_subdev) return -EPERM; return v4l2_subdev_call(sd, video, s_std, *std); } case VIDIOC_SUBDEV_ENUMSTD: { struct v4l2_standard *p = arg; v4l2_std_id id; if (v4l2_subdev_call(sd, video, g_tvnorms, &id)) return -EINVAL; return v4l_video_std_enumstd(p, id); } case VIDIOC_SUBDEV_QUERYSTD: return v4l2_subdev_call(sd, video, querystd, arg); case VIDIOC_SUBDEV_G_ROUTING: { struct v4l2_subdev_routing *routing = arg; struct v4l2_subdev_krouting *krouting; if (!v4l2_subdev_enable_streams_api) return -ENOIOCTLCMD; if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS)) return -ENOIOCTLCMD; memset(routing->reserved, 0, sizeof(routing->reserved)); krouting = &state->routing; memcpy((struct v4l2_subdev_route *)(uintptr_t)routing->routes, krouting->routes, min(krouting->num_routes, routing->len_routes) * sizeof(*krouting->routes)); routing->num_routes = krouting->num_routes; return 0; } case VIDIOC_SUBDEV_S_ROUTING: { struct v4l2_subdev_routing *routing = arg; struct v4l2_subdev_route *routes = (struct v4l2_subdev_route *)(uintptr_t)routing->routes; struct v4l2_subdev_krouting krouting = {}; unsigned int num_active_routes = 0; unsigned int i; if (!v4l2_subdev_enable_streams_api) return -ENOIOCTLCMD; if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS)) return -ENOIOCTLCMD; if (routing->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev) return -EPERM; if (routing->num_routes > routing->len_routes) return -EINVAL; memset(routing->reserved, 0, sizeof(routing->reserved)); for (i = 0; i < routing->num_routes; ++i) { const struct v4l2_subdev_route *route = &routes[i]; const struct media_pad *pads = sd->entity.pads; if (route->sink_stream > V4L2_SUBDEV_MAX_STREAM_ID || route->source_stream > V4L2_SUBDEV_MAX_STREAM_ID) return -EINVAL; if (route->sink_pad >= sd->entity.num_pads) return -EINVAL; if (!(pads[route->sink_pad].flags & MEDIA_PAD_FL_SINK)) return -EINVAL; if (route->source_pad >= sd->entity.num_pads) return -EINVAL; if (!(pads[route->source_pad].flags & MEDIA_PAD_FL_SOURCE)) return -EINVAL; if (route->flags & V4L2_SUBDEV_ROUTE_FL_ACTIVE) num_active_routes++; } /* * Drivers that implement routing need to report a frame * descriptor accordingly, with up to one entry per route. Until * the frame descriptors entries get allocated dynamically, * limit the number of active routes to * V4L2_FRAME_DESC_ENTRY_MAX. */ if (num_active_routes > V4L2_FRAME_DESC_ENTRY_MAX) return -E2BIG; /* * If the driver doesn't support setting routing, just return * the routing table. */ if (!v4l2_subdev_has_op(sd, pad, set_routing)) { memcpy((struct v4l2_subdev_route *)(uintptr_t)routing->routes, state->routing.routes, min(state->routing.num_routes, routing->len_routes) * sizeof(*state->routing.routes)); routing->num_routes = state->routing.num_routes; return 0; } krouting.num_routes = routing->num_routes; krouting.len_routes = routing->len_routes; krouting.routes = routes; rval = v4l2_subdev_call(sd, pad, set_routing, state, routing->which, &krouting); if (rval < 0) return rval; memcpy((struct v4l2_subdev_route *)(uintptr_t)routing->routes, state->routing.routes, min(state->routing.num_routes, routing->len_routes) * sizeof(*state->routing.routes)); routing->num_routes = state->routing.num_routes; return 0; } case VIDIOC_SUBDEV_G_CLIENT_CAP: { struct v4l2_subdev_client_capability *client_cap = arg; client_cap->capabilities = subdev_fh->client_caps; return 0; } case VIDIOC_SUBDEV_S_CLIENT_CAP: { struct v4l2_subdev_client_capability *client_cap = arg; /* * Clear V4L2_SUBDEV_CLIENT_CAP_STREAMS if streams API is not * enabled. Remove this when streams API is no longer * experimental. */ if (!v4l2_subdev_enable_streams_api) client_cap->capabilities &= ~V4L2_SUBDEV_CLIENT_CAP_STREAMS; /* Filter out unsupported capabilities */ client_cap->capabilities &= (V4L2_SUBDEV_CLIENT_CAP_STREAMS | V4L2_SUBDEV_CLIENT_CAP_INTERVAL_USES_WHICH); subdev_fh->client_caps = client_cap->capabilities; return 0; } default: return v4l2_subdev_call(sd, core, ioctl, cmd, arg); } return 0; } static long subdev_do_ioctl_lock(struct file *file, unsigned int cmd, void *arg) { struct video_device *vdev = video_devdata(file); struct mutex *lock = vdev->lock; long ret = -ENODEV; if (lock && mutex_lock_interruptible(lock)) return -ERESTARTSYS; if (video_is_registered(vdev)) { struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); struct v4l2_fh *vfh = file_to_v4l2_fh(file); struct v4l2_subdev_fh *subdev_fh = to_v4l2_subdev_fh(vfh); struct v4l2_subdev_state *state; state = subdev_ioctl_get_state(sd, subdev_fh, cmd, arg); if (state) v4l2_subdev_lock_state(state); ret = subdev_do_ioctl(file, cmd, arg, state); if (state) v4l2_subdev_unlock_state(state); } if (lock) mutex_unlock(lock); return ret; } static long subdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return video_usercopy(file, cmd, arg, subdev_do_ioctl_lock); } #ifdef CONFIG_COMPAT static long subdev_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); return v4l2_subdev_call(sd, core, compat_ioctl32, cmd, arg); } #endif #else /* CONFIG_VIDEO_V4L2_SUBDEV_API */ static long subdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return -ENODEV; } #ifdef CONFIG_COMPAT static long subdev_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg) { return -ENODEV; } #endif #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ static __poll_t subdev_poll(struct file *file, poll_table *wait) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); struct v4l2_fh *fh = file_to_v4l2_fh(file); if (!(sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS)) return EPOLLERR; poll_wait(file, &fh->wait, wait); if (v4l2_event_pending(fh)) return EPOLLPRI; return 0; } const struct v4l2_file_operations v4l2_subdev_fops = { .owner = THIS_MODULE, .open = subdev_open, .unlocked_ioctl = subdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl32 = subdev_compat_ioctl32, #endif .release = subdev_close, .poll = subdev_poll, }; #ifdef CONFIG_MEDIA_CONTROLLER int v4l2_subdev_get_fwnode_pad_1_to_1(struct media_entity *entity, struct fwnode_endpoint *endpoint) { struct fwnode_handle *fwnode; struct v4l2_subdev *sd; if (!is_media_entity_v4l2_subdev(entity)) return -EINVAL; sd = media_entity_to_v4l2_subdev(entity); fwnode = fwnode_graph_get_port_parent(endpoint->local_fwnode); fwnode_handle_put(fwnode); if (device_match_fwnode(sd->dev, fwnode)) return endpoint->port; return -ENXIO; } EXPORT_SYMBOL_GPL(v4l2_subdev_get_fwnode_pad_1_to_1); int v4l2_subdev_link_validate_default(struct v4l2_subdev *sd, struct media_link *link, struct v4l2_subdev_format *source_fmt, struct v4l2_subdev_format *sink_fmt) { bool pass = true; /* The width, height and code must match. */ if (source_fmt->format.width != sink_fmt->format.width) { dev_dbg(sd->entity.graph_obj.mdev->dev, "%s: width does not match (source %u, sink %u)\n", __func__, source_fmt->format.width, sink_fmt->format.width); pass = false; } if (source_fmt->format.height != sink_fmt->format.height) { dev_dbg(sd->entity.graph_obj.mdev->dev, "%s: height does not match (source %u, sink %u)\n", __func__, source_fmt->format.height, sink_fmt->format.height); pass = false; } if (source_fmt->format.code != sink_fmt->format.code) { dev_dbg(sd->entity.graph_obj.mdev->dev, "%s: media bus code does not match (source 0x%8.8x, sink 0x%8.8x)\n", __func__, source_fmt->format.code, sink_fmt->format.code); pass = false; } /* The field order must match, or the sink field order must be NONE * to support interlaced hardware connected to bridges that support * progressive formats only. */ if (source_fmt->format.field != sink_fmt->format.field && sink_fmt->format.field != V4L2_FIELD_NONE) { dev_dbg(sd->entity.graph_obj.mdev->dev, "%s: field does not match (source %u, sink %u)\n", __func__, source_fmt->format.field, sink_fmt->format.field); pass = false; } if (pass) return 0; dev_dbg(sd->entity.graph_obj.mdev->dev, "%s: link was \"%s\":%u -> \"%s\":%u\n", __func__, link->source->entity->name, link->source->index, link->sink->entity->name, link->sink->index); return -EPIPE; } EXPORT_SYMBOL_GPL(v4l2_subdev_link_validate_default); static int v4l2_subdev_link_validate_get_format(struct media_pad *pad, u32 stream, struct v4l2_subdev_format *fmt, bool states_locked) { struct v4l2_subdev_state *state; struct v4l2_subdev *sd; int ret; sd = media_entity_to_v4l2_subdev(pad->entity); fmt->which = V4L2_SUBDEV_FORMAT_ACTIVE; fmt->pad = pad->index; fmt->stream = stream; if (states_locked) state = v4l2_subdev_get_locked_active_state(sd); else state = v4l2_subdev_lock_and_get_active_state(sd); ret = v4l2_subdev_call(sd, pad, get_fmt, state, fmt); if (!states_locked && state) v4l2_subdev_unlock_state(state); return ret; } #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) static void __v4l2_link_validate_get_streams(struct media_pad *pad, u64 *streams_mask, bool states_locked) { struct v4l2_subdev_route *route; struct v4l2_subdev_state *state; struct v4l2_subdev *subdev; subdev = media_entity_to_v4l2_subdev(pad->entity); *streams_mask = 0; if (states_locked) state = v4l2_subdev_get_locked_active_state(subdev); else state = v4l2_subdev_lock_and_get_active_state(subdev); if (WARN_ON(!state)) return; for_each_active_route(&state->routing, route) { u32 route_pad; u32 route_stream; if (pad->flags & MEDIA_PAD_FL_SOURCE) { route_pad = route->source_pad; route_stream = route->source_stream; } else { route_pad = route->sink_pad; route_stream = route->sink_stream; } if (route_pad != pad->index) continue; *streams_mask |= BIT_ULL(route_stream); } if (!states_locked) v4l2_subdev_unlock_state(state); } #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ static void v4l2_link_validate_get_streams(struct media_pad *pad, u64 *streams_mask, bool states_locked) { struct v4l2_subdev *subdev = media_entity_to_v4l2_subdev(pad->entity); if (!(subdev->flags & V4L2_SUBDEV_FL_STREAMS)) { /* Non-streams subdevs have an implicit stream 0 */ *streams_mask = BIT_ULL(0); return; } #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) __v4l2_link_validate_get_streams(pad, streams_mask, states_locked); #else /* This shouldn't happen */ *streams_mask = 0; #endif } static int v4l2_subdev_link_validate_locked(struct media_link *link, bool states_locked) { struct v4l2_subdev *sink_subdev = media_entity_to_v4l2_subdev(link->sink->entity); struct device *dev = sink_subdev->entity.graph_obj.mdev->dev; u64 source_streams_mask; u64 sink_streams_mask; u64 dangling_sink_streams; u32 stream; int ret; dev_dbg(dev, "validating link \"%s\":%u -> \"%s\":%u\n", link->source->entity->name, link->source->index, link->sink->entity->name, link->sink->index); v4l2_link_validate_get_streams(link->source, &source_streams_mask, states_locked); v4l2_link_validate_get_streams(link->sink, &sink_streams_mask, states_locked); /* * It is ok to have more source streams than sink streams as extra * source streams can just be ignored by the receiver, but having extra * sink streams is an error as streams must have a source. */ dangling_sink_streams = (source_streams_mask ^ sink_streams_mask) & sink_streams_mask; if (dangling_sink_streams) { dev_err(dev, "Dangling sink streams: mask %#llx\n", dangling_sink_streams); return -EINVAL; } /* Validate source and sink stream formats */ for (stream = 0; stream < sizeof(sink_streams_mask) * 8; ++stream) { struct v4l2_subdev_format sink_fmt, source_fmt; if (!(sink_streams_mask & BIT_ULL(stream))) continue; dev_dbg(dev, "validating stream \"%s\":%u:%u -> \"%s\":%u:%u\n", link->source->entity->name, link->source->index, stream, link->sink->entity->name, link->sink->index, stream); ret = v4l2_subdev_link_validate_get_format(link->source, stream, &source_fmt, states_locked); if (ret < 0) { dev_dbg(dev, "Failed to get format for \"%s\":%u:%u (but that's ok)\n", link->source->entity->name, link->source->index, stream); continue; } ret = v4l2_subdev_link_validate_get_format(link->sink, stream, &sink_fmt, states_locked); if (ret < 0) { dev_dbg(dev, "Failed to get format for \"%s\":%u:%u (but that's ok)\n", link->sink->entity->name, link->sink->index, stream); continue; } /* TODO: add stream number to link_validate() */ ret = v4l2_subdev_call(sink_subdev, pad, link_validate, link, &source_fmt, &sink_fmt); if (!ret) continue; if (ret != -ENOIOCTLCMD) return ret; ret = v4l2_subdev_link_validate_default(sink_subdev, link, &source_fmt, &sink_fmt); if (ret) return ret; } return 0; } int v4l2_subdev_link_validate(struct media_link *link) { struct v4l2_subdev *source_sd, *sink_sd; struct v4l2_subdev_state *source_state, *sink_state; bool states_locked; int ret; /* * Links are validated in the context of the sink entity. Usage of this * helper on a sink that is not a subdev is a clear driver bug. */ if (WARN_ON_ONCE(!is_media_entity_v4l2_subdev(link->sink->entity))) return -EINVAL; /* * If the source is a video device, delegate link validation to it. This * allows usage of this helper for subdev connected to a video output * device, provided that the driver implement the video output device's * .link_validate() operation. */ if (is_media_entity_v4l2_video_device(link->source->entity)) { struct media_entity *source = link->source->entity; if (!source->ops || !source->ops->link_validate) { /* * Many existing drivers do not implement the required * .link_validate() operation for their video devices. * Print a warning to get the drivers fixed, and return * 0 to avoid breaking userspace. This should * eventually be turned into a WARN_ON() when all * drivers will have been fixed. */ pr_warn_once("video device '%s' does not implement .link_validate(), driver bug!\n", source->name); return 0; } /* * Avoid infinite loops in case a video device incorrectly uses * this helper function as its .link_validate() handler. */ if (WARN_ON(source->ops->link_validate == v4l2_subdev_link_validate)) return -EINVAL; return source->ops->link_validate(link); } /* * If the source is still not a subdev, usage of this helper is a clear * driver bug. */ if (WARN_ON(!is_media_entity_v4l2_subdev(link->source->entity))) return -EINVAL; sink_sd = media_entity_to_v4l2_subdev(link->sink->entity); source_sd = media_entity_to_v4l2_subdev(link->source->entity); sink_state = v4l2_subdev_get_unlocked_active_state(sink_sd); source_state = v4l2_subdev_get_unlocked_active_state(source_sd); states_locked = sink_state && source_state; if (states_locked) v4l2_subdev_lock_states(sink_state, source_state); ret = v4l2_subdev_link_validate_locked(link, states_locked); if (states_locked) v4l2_subdev_unlock_states(sink_state, source_state); return ret; } EXPORT_SYMBOL_GPL(v4l2_subdev_link_validate); bool v4l2_subdev_has_pad_interdep(struct media_entity *entity, unsigned int pad0, unsigned int pad1) { struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(entity); struct v4l2_subdev_krouting *routing; struct v4l2_subdev_state *state; unsigned int i; state = v4l2_subdev_lock_and_get_active_state(sd); routing = &state->routing; for (i = 0; i < routing->num_routes; ++i) { struct v4l2_subdev_route *route = &routing->routes[i]; if (!(route->flags & V4L2_SUBDEV_ROUTE_FL_ACTIVE)) continue; if ((route->sink_pad == pad0 && route->source_pad == pad1) || (route->source_pad == pad0 && route->sink_pad == pad1)) { v4l2_subdev_unlock_state(state); return true; } } v4l2_subdev_unlock_state(state); return false; } EXPORT_SYMBOL_GPL(v4l2_subdev_has_pad_interdep); struct v4l2_subdev_state * __v4l2_subdev_state_alloc(struct v4l2_subdev *sd, const char *lock_name, struct lock_class_key *lock_key) { struct v4l2_subdev_state *state; int ret; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return ERR_PTR(-ENOMEM); __mutex_init(&state->_lock, lock_name, lock_key); if (sd->state_lock) state->lock = sd->state_lock; else state->lock = &state->_lock; state->sd = sd; /* Drivers that support streams do not need the legacy pad config */ if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS) && sd->entity.num_pads) { state->pads = kvcalloc(sd->entity.num_pads, sizeof(*state->pads), GFP_KERNEL); if (!state->pads) { ret = -ENOMEM; goto err; } } if (sd->internal_ops && sd->internal_ops->init_state) { /* * There can be no race at this point, but we lock the state * anyway to satisfy lockdep checks. */ v4l2_subdev_lock_state(state); ret = sd->internal_ops->init_state(sd, state); v4l2_subdev_unlock_state(state); if (ret) goto err; } return state; err: if (state && state->pads) kvfree(state->pads); kfree(state); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_alloc); void __v4l2_subdev_state_free(struct v4l2_subdev_state *state) { if (!state) return; mutex_destroy(&state->_lock); kfree(state->routing.routes); kvfree(state->stream_configs.configs); kvfree(state->pads); kfree(state); } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_free); int __v4l2_subdev_init_finalize(struct v4l2_subdev *sd, const char *name, struct lock_class_key *key) { struct v4l2_subdev_state *state; struct device *dev = sd->dev; bool has_disable_streams; bool has_enable_streams; bool has_s_stream; /* Check that the subdevice implements the required features */ has_s_stream = v4l2_subdev_has_op(sd, video, s_stream); has_enable_streams = v4l2_subdev_has_op(sd, pad, enable_streams); has_disable_streams = v4l2_subdev_has_op(sd, pad, disable_streams); if (has_enable_streams != has_disable_streams) { dev_err(dev, "subdev '%s' must implement both or neither of .enable_streams() and .disable_streams()\n", sd->name); return -EINVAL; } if (sd->flags & V4L2_SUBDEV_FL_STREAMS) { if (has_s_stream && !has_enable_streams) { dev_err(dev, "subdev '%s' must implement .enable/disable_streams()\n", sd->name); return -EINVAL; } } if (sd->ctrl_handler) sd->flags |= V4L2_SUBDEV_FL_HAS_EVENTS; state = __v4l2_subdev_state_alloc(sd, name, key); if (IS_ERR(state)) return PTR_ERR(state); sd->active_state = state; return 0; } EXPORT_SYMBOL_GPL(__v4l2_subdev_init_finalize); void v4l2_subdev_cleanup(struct v4l2_subdev *sd) { struct v4l2_async_subdev_endpoint *ase, *ase_tmp; __v4l2_subdev_state_free(sd->active_state); sd->active_state = NULL; /* Uninitialised sub-device, bail out here. */ if (!sd->async_subdev_endpoint_list.next) return; list_for_each_entry_safe(ase, ase_tmp, &sd->async_subdev_endpoint_list, async_subdev_endpoint_entry) { list_del(&ase->async_subdev_endpoint_entry); kfree(ase); } } EXPORT_SYMBOL_GPL(v4l2_subdev_cleanup); struct v4l2_mbus_framefmt * __v4l2_subdev_state_get_format(struct v4l2_subdev_state *state, unsigned int pad, u32 stream) { struct v4l2_subdev_stream_configs *stream_configs; unsigned int i; if (WARN_ON_ONCE(!state)) return NULL; if (state->pads) { if (stream) return NULL; if (pad >= state->sd->entity.num_pads) return NULL; return &state->pads[pad].format; } lockdep_assert_held(state->lock); stream_configs = &state->stream_configs; for (i = 0; i < stream_configs->num_configs; ++i) { if (stream_configs->configs[i].pad == pad && stream_configs->configs[i].stream == stream) return &stream_configs->configs[i].fmt; } return NULL; } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_get_format); struct v4l2_rect * __v4l2_subdev_state_get_crop(struct v4l2_subdev_state *state, unsigned int pad, u32 stream) { struct v4l2_subdev_stream_configs *stream_configs; unsigned int i; if (WARN_ON_ONCE(!state)) return NULL; if (state->pads) { if (stream) return NULL; if (pad >= state->sd->entity.num_pads) return NULL; return &state->pads[pad].crop; } lockdep_assert_held(state->lock); stream_configs = &state->stream_configs; for (i = 0; i < stream_configs->num_configs; ++i) { if (stream_configs->configs[i].pad == pad && stream_configs->configs[i].stream == stream) return &stream_configs->configs[i].crop; } return NULL; } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_get_crop); struct v4l2_rect * __v4l2_subdev_state_get_compose(struct v4l2_subdev_state *state, unsigned int pad, u32 stream) { struct v4l2_subdev_stream_configs *stream_configs; unsigned int i; if (WARN_ON_ONCE(!state)) return NULL; if (state->pads) { if (stream) return NULL; if (pad >= state->sd->entity.num_pads) return NULL; return &state->pads[pad].compose; } lockdep_assert_held(state->lock); stream_configs = &state->stream_configs; for (i = 0; i < stream_configs->num_configs; ++i) { if (stream_configs->configs[i].pad == pad && stream_configs->configs[i].stream == stream) return &stream_configs->configs[i].compose; } return NULL; } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_get_compose); struct v4l2_fract * __v4l2_subdev_state_get_interval(struct v4l2_subdev_state *state, unsigned int pad, u32 stream) { struct v4l2_subdev_stream_configs *stream_configs; unsigned int i; if (WARN_ON(!state)) return NULL; lockdep_assert_held(state->lock); if (state->pads) { if (stream) return NULL; if (pad >= state->sd->entity.num_pads) return NULL; return &state->pads[pad].interval; } lockdep_assert_held(state->lock); stream_configs = &state->stream_configs; for (i = 0; i < stream_configs->num_configs; ++i) { if (stream_configs->configs[i].pad == pad && stream_configs->configs[i].stream == stream) return &stream_configs->configs[i].interval; } return NULL; } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_get_interval); #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) static int v4l2_subdev_init_stream_configs(struct v4l2_subdev_stream_configs *stream_configs, const struct v4l2_subdev_krouting *routing) { struct v4l2_subdev_stream_configs new_configs = { 0 }; struct v4l2_subdev_route *route; u32 idx; /* Count number of formats needed */ for_each_active_route(routing, route) { /* * Each route needs a format on both ends of the route. */ new_configs.num_configs += 2; } if (new_configs.num_configs) { new_configs.configs = kvcalloc(new_configs.num_configs, sizeof(*new_configs.configs), GFP_KERNEL); if (!new_configs.configs) return -ENOMEM; } /* * Fill in the 'pad' and stream' value for each item in the array from * the routing table */ idx = 0; for_each_active_route(routing, route) { new_configs.configs[idx].pad = route->sink_pad; new_configs.configs[idx].stream = route->sink_stream; idx++; new_configs.configs[idx].pad = route->source_pad; new_configs.configs[idx].stream = route->source_stream; idx++; } kvfree(stream_configs->configs); *stream_configs = new_configs; return 0; } int v4l2_subdev_get_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_format *format) { struct v4l2_mbus_framefmt *fmt; fmt = v4l2_subdev_state_get_format(state, format->pad, format->stream); if (!fmt) return -EINVAL; format->format = *fmt; return 0; } EXPORT_SYMBOL_GPL(v4l2_subdev_get_fmt); int v4l2_subdev_get_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *fi) { struct v4l2_fract *interval; interval = v4l2_subdev_state_get_interval(state, fi->pad, fi->stream); if (!interval) return -EINVAL; fi->interval = *interval; return 0; } EXPORT_SYMBOL_GPL(v4l2_subdev_get_frame_interval); int v4l2_subdev_set_routing(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, const struct v4l2_subdev_krouting *routing) { struct v4l2_subdev_krouting *dst = &state->routing; const struct v4l2_subdev_krouting *src = routing; struct v4l2_subdev_krouting new_routing = { 0 }; size_t bytes; int r; if (unlikely(check_mul_overflow((size_t)src->num_routes, sizeof(*src->routes), &bytes))) return -EOVERFLOW; lockdep_assert_held(state->lock); if (src->num_routes > 0) { new_routing.routes = kmemdup(src->routes, bytes, GFP_KERNEL); if (!new_routing.routes) return -ENOMEM; } new_routing.num_routes = src->num_routes; r = v4l2_subdev_init_stream_configs(&state->stream_configs, &new_routing); if (r) { kfree(new_routing.routes); return r; } kfree(dst->routes); *dst = new_routing; return 0; } EXPORT_SYMBOL_GPL(v4l2_subdev_set_routing); struct v4l2_subdev_route * __v4l2_subdev_next_active_route(const struct v4l2_subdev_krouting *routing, struct v4l2_subdev_route *route) { if (route) ++route; else route = &routing->routes[0]; for (; route < routing->routes + routing->num_routes; ++route) { if (!(route->flags & V4L2_SUBDEV_ROUTE_FL_ACTIVE)) continue; return route; } return NULL; } EXPORT_SYMBOL_GPL(__v4l2_subdev_next_active_route); int v4l2_subdev_set_routing_with_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, const struct v4l2_subdev_krouting *routing, const struct v4l2_mbus_framefmt *fmt) { struct v4l2_subdev_stream_configs *stream_configs; unsigned int i; int ret; ret = v4l2_subdev_set_routing(sd, state, routing); if (ret) return ret; stream_configs = &state->stream_configs; for (i = 0; i < stream_configs->num_configs; ++i) stream_configs->configs[i].fmt = *fmt; return 0; } EXPORT_SYMBOL_GPL(v4l2_subdev_set_routing_with_fmt); int v4l2_subdev_routing_find_opposite_end(const struct v4l2_subdev_krouting *routing, u32 pad, u32 stream, u32 *other_pad, u32 *other_stream) { unsigned int i; for (i = 0; i < routing->num_routes; ++i) { struct v4l2_subdev_route *route = &routing->routes[i]; if (route->source_pad == pad && route->source_stream == stream) { if (other_pad) *other_pad = route->sink_pad; if (other_stream) *other_stream = route->sink_stream; return 0; } if (route->sink_pad == pad && route->sink_stream == stream) { if (other_pad) *other_pad = route->source_pad; if (other_stream) *other_stream = route->source_stream; return 0; } } return -EINVAL; } EXPORT_SYMBOL_GPL(v4l2_subdev_routing_find_opposite_end); struct v4l2_mbus_framefmt * v4l2_subdev_state_get_opposite_stream_format(struct v4l2_subdev_state *state, u32 pad, u32 stream) { u32 other_pad, other_stream; int ret; ret = v4l2_subdev_routing_find_opposite_end(&state->routing, pad, stream, &other_pad, &other_stream); if (ret) return NULL; return v4l2_subdev_state_get_format(state, other_pad, other_stream); } EXPORT_SYMBOL_GPL(v4l2_subdev_state_get_opposite_stream_format); u64 v4l2_subdev_state_xlate_streams(const struct v4l2_subdev_state *state, u32 pad0, u32 pad1, u64 *streams) { const struct v4l2_subdev_krouting *routing = &state->routing; struct v4l2_subdev_route *route; u64 streams0 = 0; u64 streams1 = 0; for_each_active_route(routing, route) { if (route->sink_pad == pad0 && route->source_pad == pad1 && (*streams & BIT_ULL(route->sink_stream))) { streams0 |= BIT_ULL(route->sink_stream); streams1 |= BIT_ULL(route->source_stream); } if (route->source_pad == pad0 && route->sink_pad == pad1 && (*streams & BIT_ULL(route->source_stream))) { streams0 |= BIT_ULL(route->source_stream); streams1 |= BIT_ULL(route->sink_stream); } } *streams = streams0; return streams1; } EXPORT_SYMBOL_GPL(v4l2_subdev_state_xlate_streams); int v4l2_subdev_routing_validate(struct v4l2_subdev *sd, const struct v4l2_subdev_krouting *routing, enum v4l2_subdev_routing_restriction disallow) { u32 *remote_pads = NULL; unsigned int i, j; int ret = -EINVAL; if (disallow & (V4L2_SUBDEV_ROUTING_NO_STREAM_MIX | V4L2_SUBDEV_ROUTING_NO_MULTIPLEXING)) { remote_pads = kcalloc(sd->entity.num_pads, sizeof(*remote_pads), GFP_KERNEL); if (!remote_pads) return -ENOMEM; for (i = 0; i < sd->entity.num_pads; ++i) remote_pads[i] = U32_MAX; } for (i = 0; i < routing->num_routes; ++i) { const struct v4l2_subdev_route *route = &routing->routes[i]; /* Validate the sink and source pad numbers. */ if (route->sink_pad >= sd->entity.num_pads || !(sd->entity.pads[route->sink_pad].flags & MEDIA_PAD_FL_SINK)) { dev_dbg(sd->dev, "route %u sink (%u) is not a sink pad\n", i, route->sink_pad); goto out; } if (route->source_pad >= sd->entity.num_pads || !(sd->entity.pads[route->source_pad].flags & MEDIA_PAD_FL_SOURCE)) { dev_dbg(sd->dev, "route %u source (%u) is not a source pad\n", i, route->source_pad); goto out; } /* * V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX: all streams from a * sink pad must be routed to a single source pad. */ if (disallow & V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX) { if (remote_pads[route->sink_pad] != U32_MAX && remote_pads[route->sink_pad] != route->source_pad) { dev_dbg(sd->dev, "route %u attempts to mix %s streams\n", i, "sink"); goto out; } } /* * V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX: all streams on a * source pad must originate from a single sink pad. */ if (disallow & V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX) { if (remote_pads[route->source_pad] != U32_MAX && remote_pads[route->source_pad] != route->sink_pad) { dev_dbg(sd->dev, "route %u attempts to mix %s streams\n", i, "source"); goto out; } } /* * V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING: Pads on the sink * side can not do stream multiplexing, i.e. there can be only * a single stream in a sink pad. */ if (disallow & V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING) { if (remote_pads[route->sink_pad] != U32_MAX) { dev_dbg(sd->dev, "route %u attempts to multiplex on %s pad %u\n", i, "sink", route->sink_pad); goto out; } } /* * V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING: Pads on the * source side can not do stream multiplexing, i.e. there can * be only a single stream in a source pad. */ if (disallow & V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING) { if (remote_pads[route->source_pad] != U32_MAX) { dev_dbg(sd->dev, "route %u attempts to multiplex on %s pad %u\n", i, "source", route->source_pad); goto out; } } if (remote_pads) { remote_pads[route->sink_pad] = route->source_pad; remote_pads[route->source_pad] = route->sink_pad; } for (j = i + 1; j < routing->num_routes; ++j) { const struct v4l2_subdev_route *r = &routing->routes[j]; /* * V4L2_SUBDEV_ROUTING_NO_1_TO_N: No two routes can * originate from the same (sink) stream. */ if ((disallow & V4L2_SUBDEV_ROUTING_NO_1_TO_N) && route->sink_pad == r->sink_pad && route->sink_stream == r->sink_stream) { dev_dbg(sd->dev, "routes %u and %u originate from same sink (%u/%u)\n", i, j, route->sink_pad, route->sink_stream); goto out; } /* * V4L2_SUBDEV_ROUTING_NO_N_TO_1: No two routes can end * at the same (source) stream. */ if ((disallow & V4L2_SUBDEV_ROUTING_NO_N_TO_1) && route->source_pad == r->source_pad && route->source_stream == r->source_stream) { dev_dbg(sd->dev, "routes %u and %u end at same source (%u/%u)\n", i, j, route->source_pad, route->source_stream); goto out; } } } ret = 0; out: kfree(remote_pads); return ret; } EXPORT_SYMBOL_GPL(v4l2_subdev_routing_validate); static void v4l2_subdev_collect_streams(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, u32 pad, u64 streams_mask, u64 *found_streams, u64 *enabled_streams) { if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS)) { *found_streams = BIT_ULL(0); *enabled_streams = (sd->enabled_pads & BIT_ULL(pad)) ? BIT_ULL(0) : 0; dev_dbg(sd->dev, "collect_streams: sub-device \"%s\" does not support streams\n", sd->entity.name); return; } *found_streams = 0; *enabled_streams = 0; for (unsigned int i = 0; i < state->stream_configs.num_configs; ++i) { const struct v4l2_subdev_stream_config *cfg = &state->stream_configs.configs[i]; if (cfg->pad != pad || !(streams_mask & BIT_ULL(cfg->stream))) continue; *found_streams |= BIT_ULL(cfg->stream); if (cfg->enabled) *enabled_streams |= BIT_ULL(cfg->stream); } dev_dbg(sd->dev, "collect_streams: \"%s\":%u: found %#llx enabled %#llx\n", sd->entity.name, pad, *found_streams, *enabled_streams); } static void v4l2_subdev_set_streams_enabled(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, u32 pad, u64 streams_mask, bool enabled) { if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS)) { if (enabled) sd->enabled_pads |= BIT_ULL(pad); else sd->enabled_pads &= ~BIT_ULL(pad); return; } for (unsigned int i = 0; i < state->stream_configs.num_configs; ++i) { struct v4l2_subdev_stream_config *cfg = &state->stream_configs.configs[i]; if (cfg->pad == pad && (streams_mask & BIT_ULL(cfg->stream))) cfg->enabled = enabled; } } int v4l2_subdev_enable_streams(struct v4l2_subdev *sd, u32 pad, u64 streams_mask) { struct device *dev = sd->entity.graph_obj.mdev->dev; struct v4l2_subdev_state *state; bool already_streaming; u64 enabled_streams; u64 found_streams; bool use_s_stream; int ret; dev_dbg(dev, "enable streams \"%s\":%u/%#llx\n", sd->entity.name, pad, streams_mask); /* A few basic sanity checks first. */ if (pad >= sd->entity.num_pads) return -EINVAL; if (!(sd->entity.pads[pad].flags & MEDIA_PAD_FL_SOURCE)) return -EOPNOTSUPP; /* * We use a 64-bit bitmask for tracking enabled pads, so only subdevices * with 64 pads or less can be supported. */ if (pad >= sizeof(sd->enabled_pads) * BITS_PER_BYTE) return -EOPNOTSUPP; if (!streams_mask) return 0; /* Fallback on .s_stream() if .enable_streams() isn't available. */ use_s_stream = !v4l2_subdev_has_op(sd, pad, enable_streams); if (!use_s_stream) state = v4l2_subdev_lock_and_get_active_state(sd); else state = NULL; /* * Verify that the requested streams exist and that they are not * already enabled. */ v4l2_subdev_collect_streams(sd, state, pad, streams_mask, &found_streams, &enabled_streams); if (found_streams != streams_mask) { dev_dbg(dev, "streams 0x%llx not found on %s:%u\n", streams_mask & ~found_streams, sd->entity.name, pad); ret = -EINVAL; goto done; } if (enabled_streams) { dev_dbg(dev, "streams 0x%llx already enabled on %s:%u\n", enabled_streams, sd->entity.name, pad); ret = -EALREADY; goto done; } already_streaming = v4l2_subdev_is_streaming(sd); if (!use_s_stream) { /* Call the .enable_streams() operation. */ ret = v4l2_subdev_call(sd, pad, enable_streams, state, pad, streams_mask); } else { /* Start streaming when the first pad is enabled. */ if (!already_streaming) ret = v4l2_subdev_call(sd, video, s_stream, 1); else ret = 0; } if (ret) { dev_dbg(dev, "enable streams %u:%#llx failed: %d\n", pad, streams_mask, ret); goto done; } /* Mark the streams as enabled. */ v4l2_subdev_set_streams_enabled(sd, state, pad, streams_mask, true); /* * TODO: When all the drivers have been changed to use * v4l2_subdev_enable_streams() and v4l2_subdev_disable_streams(), * instead of calling .s_stream() operation directly, we can remove * the privacy LED handling from call_s_stream() and do it here * for all cases. */ if (!use_s_stream && !already_streaming) v4l2_subdev_enable_privacy_led(sd); done: if (!use_s_stream) v4l2_subdev_unlock_state(state); return ret; } EXPORT_SYMBOL_GPL(v4l2_subdev_enable_streams); int v4l2_subdev_disable_streams(struct v4l2_subdev *sd, u32 pad, u64 streams_mask) { struct device *dev = sd->entity.graph_obj.mdev->dev; struct v4l2_subdev_state *state; u64 enabled_streams; u64 found_streams; bool use_s_stream; int ret; dev_dbg(dev, "disable streams \"%s\":%u/%#llx\n", sd->entity.name, pad, streams_mask); /* A few basic sanity checks first. */ if (pad >= sd->entity.num_pads) return -EINVAL; if (!(sd->entity.pads[pad].flags & MEDIA_PAD_FL_SOURCE)) return -EOPNOTSUPP; /* * We use a 64-bit bitmask for tracking enabled pads, so only subdevices * with 64 pads or less can be supported. */ if (pad >= sizeof(sd->enabled_pads) * BITS_PER_BYTE) return -EOPNOTSUPP; if (!streams_mask) return 0; /* Fallback on .s_stream() if .disable_streams() isn't available. */ use_s_stream = !v4l2_subdev_has_op(sd, pad, disable_streams); if (!use_s_stream) state = v4l2_subdev_lock_and_get_active_state(sd); else state = NULL; /* * Verify that the requested streams exist and that they are not * already disabled. */ v4l2_subdev_collect_streams(sd, state, pad, streams_mask, &found_streams, &enabled_streams); if (found_streams != streams_mask) { dev_dbg(dev, "streams 0x%llx not found on %s:%u\n", streams_mask & ~found_streams, sd->entity.name, pad); ret = -EINVAL; goto done; } if (enabled_streams != streams_mask) { dev_dbg(dev, "streams 0x%llx already disabled on %s:%u\n", streams_mask & ~enabled_streams, sd->entity.name, pad); ret = -EALREADY; goto done; } if (!use_s_stream) { /* Call the .disable_streams() operation. */ ret = v4l2_subdev_call(sd, pad, disable_streams, state, pad, streams_mask); } else { /* Stop streaming when the last streams are disabled. */ if (!(sd->enabled_pads & ~BIT_ULL(pad))) ret = v4l2_subdev_call(sd, video, s_stream, 0); else ret = 0; } if (ret) { dev_dbg(dev, "disable streams %u:%#llx failed: %d\n", pad, streams_mask, ret); goto done; } v4l2_subdev_set_streams_enabled(sd, state, pad, streams_mask, false); done: if (!use_s_stream) { if (!v4l2_subdev_is_streaming(sd)) v4l2_subdev_disable_privacy_led(sd); v4l2_subdev_unlock_state(state); } return ret; } EXPORT_SYMBOL_GPL(v4l2_subdev_disable_streams); int v4l2_subdev_s_stream_helper(struct v4l2_subdev *sd, int enable) { struct v4l2_subdev_state *state; struct v4l2_subdev_route *route; struct media_pad *pad; u64 source_mask = 0; int pad_index = -1; /* * Find the source pad. This helper is meant for subdevs that have a * single source pad, so failures shouldn't happen, but catch them * loudly nonetheless as they indicate a driver bug. */ media_entity_for_each_pad(&sd->entity, pad) { if (pad->flags & MEDIA_PAD_FL_SOURCE) { pad_index = pad->index; break; } } if (WARN_ON(pad_index == -1)) return -EINVAL; if (sd->flags & V4L2_SUBDEV_FL_STREAMS) { /* * As there's a single source pad, just collect all the source * streams. */ state = v4l2_subdev_lock_and_get_active_state(sd); for_each_active_route(&state->routing, route) source_mask |= BIT_ULL(route->source_stream); v4l2_subdev_unlock_state(state); } else { /* * For non-streams subdevices, there's a single implicit stream * per pad. */ source_mask = BIT_ULL(0); } if (enable) return v4l2_subdev_enable_streams(sd, pad_index, source_mask); else return v4l2_subdev_disable_streams(sd, pad_index, source_mask); } EXPORT_SYMBOL_GPL(v4l2_subdev_s_stream_helper); #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ #endif /* CONFIG_MEDIA_CONTROLLER */ void v4l2_subdev_init(struct v4l2_subdev *sd, const struct v4l2_subdev_ops *ops) { INIT_LIST_HEAD(&sd->list); BUG_ON(!ops); sd->ops = ops; sd->v4l2_dev = NULL; sd->flags = 0; sd->name[0] = '\0'; sd->grp_id = 0; sd->dev_priv = NULL; sd->host_priv = NULL; sd->privacy_led = NULL; INIT_LIST_HEAD(&sd->async_subdev_endpoint_list); #if defined(CONFIG_MEDIA_CONTROLLER) sd->entity.name = sd->name; sd->entity.obj_type = MEDIA_ENTITY_TYPE_V4L2_SUBDEV; sd->entity.function = MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN; #endif } EXPORT_SYMBOL(v4l2_subdev_init); void v4l2_subdev_notify_event(struct v4l2_subdev *sd, const struct v4l2_event *ev) { v4l2_event_queue(sd->devnode, ev); v4l2_subdev_notify(sd, V4L2_DEVICE_NOTIFY_EVENT, (void *)ev); } EXPORT_SYMBOL_GPL(v4l2_subdev_notify_event); bool v4l2_subdev_is_streaming(struct v4l2_subdev *sd) { struct v4l2_subdev_state *state; if (!v4l2_subdev_has_op(sd, pad, enable_streams)) return sd->s_stream_enabled; if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS)) return !!sd->enabled_pads; state = v4l2_subdev_get_locked_active_state(sd); for (unsigned int i = 0; i < state->stream_configs.num_configs; ++i) { const struct v4l2_subdev_stream_config *cfg; cfg = &state->stream_configs.configs[i]; if (cfg->enabled) return true; } return false; } EXPORT_SYMBOL_GPL(v4l2_subdev_is_streaming); int v4l2_subdev_get_privacy_led(struct v4l2_subdev *sd) { #if IS_REACHABLE(CONFIG_LEDS_CLASS) sd->privacy_led = led_get(sd->dev, "privacy"); if (IS_ERR(sd->privacy_led) && PTR_ERR(sd->privacy_led) != -ENOENT) return dev_err_probe(sd->dev, PTR_ERR(sd->privacy_led), "getting privacy LED\n"); if (!IS_ERR_OR_NULL(sd->privacy_led)) { mutex_lock(&sd->privacy_led->led_access); led_sysfs_disable(sd->privacy_led); led_trigger_remove(sd->privacy_led); led_set_brightness(sd->privacy_led, 0); mutex_unlock(&sd->privacy_led->led_access); } #endif return 0; } EXPORT_SYMBOL_GPL(v4l2_subdev_get_privacy_led); void v4l2_subdev_put_privacy_led(struct v4l2_subdev *sd) { #if IS_REACHABLE(CONFIG_LEDS_CLASS) if (!IS_ERR_OR_NULL(sd->privacy_led)) { mutex_lock(&sd->privacy_led->led_access); led_sysfs_enable(sd->privacy_led); mutex_unlock(&sd->privacy_led->led_access); led_put(sd->privacy_led); } #endif } EXPORT_SYMBOL_GPL(v4l2_subdev_put_privacy_led);
7 9 6 6 6 7 9 8 9 9 9 9 9 8 221 219 219 219 159 3 160 30 211 211 194 196 197 186 8 9 218 9 9 213 220 219 220 8 1 7 220 10 9 9 221 221 218 221 223 206 230 222 231 235 234 73 1 73 1 221 1 219 223 216 234 234 231 235 235 231 232 229 235 218 219 21 21 21 21 21 19 2 1 20 19 21 152 151 63 245 6 242 227 234 21 21 231 226 102 157 101 89 55 88 14 151 153 154 154 151 13 150 35 145 11 151 149 144 146 146 148 242 146 148 73 74 61 75 73 75 74 75 32 32 31 32 32 32 46 46 44 44 46 45 46 46 46 46 46 44 6 6 5 1 1 1 1 1 39 38 39 38 2 2 1 8 3 2 1 8 12 44 42 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/sched.h> #include <linux/hugetlb.h> #include <linux/mmu_context.h> #include <linux/swap.h> #include <linux/swapops.h> #include <asm/tlbflush.h> #include "internal.h" /* * We want to know the real level where a entry is located ignoring any * folding of levels which may be happening. For example if p4d is folded then * a missing entry found at level 1 (p4d) is actually at level 0 (pgd). */ static int real_depth(int depth) { if (depth == 3 && PTRS_PER_PMD == 1) depth = 2; if (depth == 2 && PTRS_PER_PUD == 1) depth = 1; if (depth == 1 && PTRS_PER_P4D == 1) depth = 0; return depth; } static int walk_pte_range_inner(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { const struct mm_walk_ops *ops = walk->ops; int err = 0; for (;;) { if (ops->install_pte && pte_none(ptep_get(pte))) { pte_t new_pte; err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte, walk); if (err) break; set_pte_at(walk->mm, addr, pte, new_pte); /* Non-present before, so for arches that need it. */ if (!WARN_ON_ONCE(walk->no_vma)) update_mmu_cache(walk->vma, addr, pte); } else { err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; } if (addr >= end - PAGE_SIZE) break; addr += PAGE_SIZE; pte++; } return err; } static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { pte_t *pte; int err = 0; spinlock_t *ptl; if (walk->no_vma) { /* * pte_offset_map() might apply user-specific validation. * Indeed, on x86_64 the pmd entries set up by init_espfix_ap() * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear), * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them. */ if (walk->mm == &init_mm || addr >= TASK_SIZE) pte = pte_offset_kernel(pmd, addr); else pte = pte_offset_map(pmd, addr); if (pte) { err = walk_pte_range_inner(pte, addr, end, walk); if (walk->mm != &init_mm && addr < TASK_SIZE) pte_unmap(pte); } } else { pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (pte) { err = walk_pte_range_inner(pte, addr, end, walk); pte_unmap_unlock(pte, ptl); } } if (!pte) walk->action = ACTION_AGAIN; return err; } static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { pmd_t *pmd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->pte_entry; bool has_install = ops->install_pte; int err = 0; int depth = real_depth(3); pmd = pmd_offset(pud, addr); do { again: next = pmd_addr_end(addr, end); if (pmd_none(*pmd)) { if (has_install) err = __pte_alloc(walk->mm, pmd); else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; if (!has_install) continue; } walk->action = ACTION_SUBTREE; /* * This implies that each ->pmd_entry() handler * needs to know about pmd_trans_huge() pmds */ if (ops->pmd_entry) err = ops->pmd_entry(pmd, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; if (walk->action == ACTION_CONTINUE) continue; if (!has_handler) { /* No handlers for lower page tables. */ if (!has_install) continue; /* Nothing to do. */ /* * We are ONLY installing, so avoid unnecessarily * splitting a present huge page. */ if (pmd_present(*pmd) && pmd_trans_huge(*pmd)) continue; } if (walk->vma) split_huge_pmd(walk->vma, pmd, addr); else if (pmd_leaf(*pmd) || !pmd_present(*pmd)) continue; /* Nothing to do. */ err = walk_pte_range(pmd, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; } while (pmd++, addr = next, addr != end); return err; } static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, struct mm_walk *walk) { pud_t *pud; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->pmd_entry || ops->pte_entry; bool has_install = ops->install_pte; int err = 0; int depth = real_depth(2); pud = pud_offset(p4d, addr); do { again: next = pud_addr_end(addr, end); if (pud_none(*pud)) { if (has_install) err = __pmd_alloc(walk->mm, pud, addr); else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; if (!has_install) continue; } walk->action = ACTION_SUBTREE; if (ops->pud_entry) err = ops->pud_entry(pud, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; if (walk->action == ACTION_CONTINUE) continue; if (!has_handler) { /* No handlers for lower page tables. */ if (!has_install) continue; /* Nothing to do. */ /* * We are ONLY installing, so avoid unnecessarily * splitting a present huge page. */ if (pud_present(*pud) && pud_trans_huge(*pud)) continue; } if (walk->vma) split_huge_pud(walk->vma, pud, addr); else if (pud_leaf(*pud) || !pud_present(*pud)) continue; /* Nothing to do. */ if (pud_none(*pud)) goto again; err = walk_pmd_range(pud, addr, next, walk); if (err) break; } while (pud++, addr = next, addr != end); return err; } static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, struct mm_walk *walk) { p4d_t *p4d; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->pud_entry || ops->pmd_entry || ops->pte_entry; bool has_install = ops->install_pte; int err = 0; int depth = real_depth(1); p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { if (has_install) err = __pud_alloc(walk->mm, p4d, addr); else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; if (!has_install) continue; } if (ops->p4d_entry) { err = ops->p4d_entry(p4d, addr, next, walk); if (err) break; } if (has_handler || has_install) err = walk_pud_range(p4d, addr, next, walk); if (err) break; } while (p4d++, addr = next, addr != end); return err; } static int walk_pgd_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { pgd_t *pgd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry; bool has_install = ops->install_pte; int err = 0; if (walk->pgd) pgd = walk->pgd + pgd_index(addr); else pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { if (has_install) err = __p4d_alloc(walk->mm, pgd, addr); else if (ops->pte_hole) err = ops->pte_hole(addr, next, 0, walk); if (err) break; if (!has_install) continue; } if (ops->pgd_entry) { err = ops->pgd_entry(pgd, addr, next, walk); if (err) break; } if (has_handler || has_install) err = walk_p4d_range(pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); return err; } #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, unsigned long end) { unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); return boundary < end ? boundary : end; } static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); unsigned long sz = huge_page_size(h); pte_t *pte; const struct mm_walk_ops *ops = walk->ops; int err = 0; hugetlb_vma_lock_read(vma); do { next = hugetlb_entry_end(h, addr, end); pte = hugetlb_walk(vma, addr & hmask, sz); if (pte) err = ops->hugetlb_entry(pte, hmask, addr, next, walk); else if (ops->pte_hole) err = ops->pte_hole(addr, next, -1, walk); if (err) break; } while (addr = next, addr != end); hugetlb_vma_unlock_read(vma); return err; } #else /* CONFIG_HUGETLB_PAGE */ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ /* * Decide whether we really walk over the current vma on [@start, @end) * or skip it via the returned value. Return 0 if we do walk over the * current vma, and return 1 if we skip the vma. Negative values means * error, where we abort the current walk. */ static int walk_page_test(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; if (ops->test_walk) return ops->test_walk(start, end, walk); /* * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP * range, so we don't walk over it as we do for normal vmas. However, * Some callers are interested in handling hole range and they don't * want to just ignore any single address range. Such users certainly * define their ->pte_hole() callbacks, so let's delegate them to handle * vma(VM_PFNMAP). */ if (vma->vm_flags & VM_PFNMAP) { int err = 1; if (ops->pte_hole) err = ops->pte_hole(start, end, -1, walk); return err ? err : 1; } return 0; } static int __walk_page_range(unsigned long start, unsigned long end, struct mm_walk *walk) { int err = 0; struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; bool is_hugetlb = is_vm_hugetlb_page(vma); /* We do not support hugetlb PTE installation. */ if (ops->install_pte && is_hugetlb) return -EINVAL; if (ops->pre_vma) { err = ops->pre_vma(start, end, walk); if (err) return err; } if (is_hugetlb) { if (ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else err = walk_pgd_range(start, end, walk); if (ops->post_vma) ops->post_vma(walk); return err; } static inline void process_mm_walk_lock(struct mm_struct *mm, enum page_walk_lock walk_lock) { if (walk_lock == PGWALK_RDLOCK) mmap_assert_locked(mm); else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY) mmap_assert_write_locked(mm); } static inline void process_vma_walk_lock(struct vm_area_struct *vma, enum page_walk_lock walk_lock) { #ifdef CONFIG_PER_VMA_LOCK switch (walk_lock) { case PGWALK_WRLOCK: vma_start_write(vma); break; case PGWALK_WRLOCK_VERIFY: vma_assert_write_locked(vma); break; case PGWALK_VMA_RDLOCK_VERIFY: vma_assert_locked(vma); break; case PGWALK_RDLOCK: /* PGWALK_RDLOCK is handled by process_mm_walk_lock */ break; } #endif } /* * See the comment for walk_page_range(), this performs the heavy lifting of the * operation, only sets no restrictions on how the walk proceeds. * * We usually restrict the ability to install PTEs, but this functionality is * available to internal memory management code and provided in mm/internal.h. */ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { int err = 0; unsigned long next; struct vm_area_struct *vma; struct mm_walk walk = { .ops = ops, .mm = mm, .private = private, }; if (start >= end) return -EINVAL; if (!walk.mm) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); vma = find_vma(walk.mm, start); do { if (!vma) { /* after the last vma */ walk.vma = NULL; next = end; if (ops->pte_hole) err = ops->pte_hole(start, next, -1, &walk); } else if (start < vma->vm_start) { /* outside vma */ walk.vma = NULL; next = min(end, vma->vm_start); if (ops->pte_hole) err = ops->pte_hole(start, next, -1, &walk); } else { /* inside vma */ process_vma_walk_lock(vma, ops->walk_lock); walk.vma = vma; next = min(end, vma->vm_end); vma = find_vma(mm, vma->vm_end); err = walk_page_test(start, next, &walk); if (err > 0) { /* * positive return values are purely for * controlling the pagewalk, so should never * be passed to the callers. */ err = 0; continue; } if (err < 0) break; err = __walk_page_range(start, next, &walk); } if (err) break; } while (start = next, start < end); return err; } /* * Determine if the walk operations specified are permitted to be used for a * page table walk. * * This check is performed on all functions which are parameterised by walk * operations and exposed in include/linux/pagewalk.h. * * Internal memory management code can use the walk_page_range_mm() function to * be able to use all page walking operations. */ static bool check_ops_valid(const struct mm_walk_ops *ops) { /* * The installation of PTEs is solely under the control of memory * management logic and subject to many subtle locking, security and * cache considerations so we cannot permit other users to do so, and * certainly not for exported symbols. */ if (ops->install_pte) return false; return true; } /** * walk_page_range - walk page table with caller specific callbacks * @mm: mm_struct representing the target process of page table walk * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk * @private: private data for callbacks' usage * * Recursively walk the page table tree of the process represented by @mm * within the virtual address range [@start, @end). During walking, we can do * some caller-specific works for each entry, by setting up pmd_entry(), * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these * callbacks, the associated entries/pages are just ignored. * The return values of these callbacks are commonly defined like below: * * - 0 : succeeded to handle the current entry, and if you don't reach the * end address yet, continue to walk. * - >0 : succeeded to handle the current entry, and return to the caller * with caller specific value. * - <0 : failed to handle the current entry, and return to the caller * with error code. * * Before starting to walk page table, some callers want to check whether * they really want to walk over the current vma, typically by checking * its vm_flags. walk_page_test() and @ops->test_walk() are used for this * purpose. * * If operations need to be staged before and committed after a vma is walked, * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), * since it is intended to handle commit-type operations, can't return any * errors. * * struct mm_walk keeps current values of some common data like vma and pmd, * which are useful for the access from callbacks. If you want to pass some * caller-specific data to callbacks, @private should be helpful. * * Locking: * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, * because these function traverse vma list and/or access to vma's data. */ int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { if (!check_ops_valid(ops)) return -EINVAL; return walk_page_range_mm(mm, start, end, ops, private); } /** * walk_kernel_page_table_range - walk a range of kernel pagetables. * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk * @pgd: pgd to walk if different from mm->pgd * @private: private data for callbacks' usage * * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for * walking kernel pages tables or page tables for firmware. * * Note: Be careful to walk the kernel pages tables, the caller may be need to * take other effective approaches (mmap lock may be insufficient) to prevent * the intermediate kernel page tables belonging to the specified address range * from being freed (e.g. memory hot-remove). */ int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { /* * Kernel intermediate page tables are usually not freed, so the mmap * read lock is sufficient. But there are some exceptions. * E.g. memory hot-remove. In which case, the mmap lock is insufficient * to prevent the intermediate kernel pages tables belonging to the * specified address range from being freed. The caller should take * other actions to prevent this race. */ mmap_assert_locked(&init_mm); return walk_kernel_page_table_range_lockless(start, end, ops, pgd, private); } /* * Use this function to walk the kernel page tables locklessly. It should be * guaranteed that the caller has exclusive access over the range they are * operating on - that there should be no concurrent access, for example, * changing permissions for vmalloc objects. */ int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { struct mm_walk walk = { .ops = ops, .mm = &init_mm, .pgd = pgd, .private = private, .no_vma = true }; if (start >= end) return -EINVAL; if (!check_ops_valid(ops)) return -EINVAL; return walk_pgd_range(start, end, &walk); } /** * walk_page_range_debug - walk a range of pagetables not backed by a vma * @mm: mm_struct representing the target process of page table walk * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk * @pgd: pgd to walk if different from mm->pgd * @private: private data for callbacks' usage * * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. * * This is for debugging purposes ONLY. */ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { struct mm_walk walk = { .ops = ops, .mm = mm, .pgd = pgd, .private = private, .no_vma = true }; /* For convenience, we allow traversal of kernel mappings. */ if (mm == &init_mm) return walk_kernel_page_table_range(start, end, ops, pgd, private); if (start >= end || !walk.mm) return -EINVAL; if (!check_ops_valid(ops)) return -EINVAL; /* * The mmap lock protects the page walker from changes to the page * tables during the walk. However a read lock is insufficient to * protect those areas which don't have a VMA as munmap() detaches * the VMAs before downgrading to a read lock and actually tearing * down PTEs/page tables. In which case, the mmap write lock should * be held. */ mmap_assert_write_locked(mm); return walk_pgd_range(start, end, &walk); } int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .mm = vma->vm_mm, .vma = vma, .private = private, }; if (start >= end || !walk.mm) return -EINVAL; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; if (!check_ops_valid(ops)) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(start, end, &walk); } int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .mm = vma->vm_mm, .vma = vma, .private = private, }; if (!walk.mm) return -EINVAL; if (!check_ops_valid(ops)) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } /** * walk_page_mapping - walk all memory areas mapped into a struct address_space. * @mapping: Pointer to the struct address_space * @first_index: First page offset in the address_space * @nr: Number of incremental page offsets to cover * @ops: operation to call during the walk * @private: private data for callbacks' usage * * This function walks all memory areas mapped into a struct address_space. * The walk is limited to only the given page-size index range, but if * the index boundaries cross a huge page-table entry, that entry will be * included. * * Also see walk_page_range() for additional information. * * Locking: * This function can't require that the struct mm_struct::mmap_lock is held, * since @mapping may be mapped by multiple processes. Instead * @mapping->i_mmap_rwsem must be held. This might have implications in the * callbacks, and it's up tho the caller to ensure that the * struct mm_struct::mmap_lock is not needed. * * Also this means that a caller can't rely on the struct * vm_area_struct::vm_flags to be constant across a call, * except for immutable flags. Callers requiring this shouldn't use * this function. * * Return: 0 on success, negative error code on failure, positive number on * caller defined premature termination. */ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .private = private, }; struct vm_area_struct *vma; pgoff_t vba, vea, cba, cea; unsigned long start_addr, end_addr; int err = 0; if (!check_ops_valid(ops)) return -EINVAL; lockdep_assert_held(&mapping->i_mmap_rwsem); vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, first_index + nr - 1) { /* Clip to the vma */ vba = vma->vm_pgoff; vea = vba + vma_pages(vma); cba = first_index; cba = max(cba, vba); cea = first_index + nr; cea = min(cea, vea); start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; if (start_addr >= end_addr) continue; walk.vma = vma; walk.mm = vma->vm_mm; err = walk_page_test(vma->vm_start, vma->vm_end, &walk); if (err > 0) { err = 0; break; } else if (err < 0) break; err = __walk_page_range(start_addr, end_addr, &walk); if (err) break; } return err; } /** * folio_walk_start - walk the page tables to a folio * @fw: filled with information on success. * @vma: the VMA. * @addr: the virtual address to use for the page table walk. * @flags: flags modifying which folios to walk to. * * Walk the page tables using @addr in a given @vma to a mapped folio and * return the folio, making sure that the page table entry referenced by * @addr cannot change until folio_walk_end() was called. * * As default, this function returns only folios that are not special (e.g., not * the zeropage) and never returns folios that are supposed to be ignored by the * VM as documented by vm_normal_page(). If requested, zeropages will be * returned as well. * * As default, this function only considers present page table entries. * If requested, it will also consider migration entries. * * If this function returns NULL it might either indicate "there is nothing" or * "there is nothing suitable". * * On success, @fw is filled and the function returns the folio while the PTL * is still held and folio_walk_end() must be called to clean up, * releasing any held locks. The returned folio must *not* be used after the * call to folio_walk_end(), unless a short-term folio reference is taken before * that call. * * @fw->page will correspond to the page that is effectively referenced by * @addr. However, for migration entries and shared zeropages @fw->page is * set to NULL. Note that large folios might be mapped by multiple page table * entries, and this function will always only lookup a single entry as * specified by @addr, which might or might not cover more than a single page of * the returned folio. * * This function must *not* be used as a naive replacement for * get_user_pages() / pin_user_pages(), especially not to perform DMA or * to carelessly modify page content. This function may *only* be used to grab * short-term folio references, never to grab long-term folio references. * * Using the page table entry pointers in @fw for reading or modifying the * entry should be avoided where possible: however, there might be valid * use cases. * * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care. * For example, PMD page table sharing might require prior unsharing. Also, * logical hugetlb entries might span multiple physical page table entries, * which *must* be modified in a single operation (set_huge_pte_at(), * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might * not correspond to the first physical entry of a logical hugetlb entry. * * The mmap lock must be held in read mode. * * Return: folio pointer on success, otherwise NULL. */ struct folio *folio_walk_start(struct folio_walk *fw, struct vm_area_struct *vma, unsigned long addr, folio_walk_flags_t flags) { unsigned long entry_size; bool expose_page = true; struct page *page; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; spinlock_t *ptl; pgd_t *pgdp; p4d_t *p4dp; mmap_assert_locked(vma->vm_mm); vma_pgtable_walk_begin(vma); if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end)) goto not_found; pgdp = pgd_offset(vma->vm_mm, addr); if (pgd_none_or_clear_bad(pgdp)) goto not_found; p4dp = p4d_offset(pgdp, addr); if (p4d_none_or_clear_bad(p4dp)) goto not_found; pudp = pud_offset(p4dp, addr); pud = pudp_get(pudp); if (pud_none(pud)) goto not_found; if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && (!pud_present(pud) || pud_leaf(pud))) { ptl = pud_lock(vma->vm_mm, pudp); pud = pudp_get(pudp); entry_size = PUD_SIZE; fw->level = FW_LEVEL_PUD; fw->pudp = pudp; fw->pud = pud; if (pud_none(pud)) { spin_unlock(ptl); goto not_found; } else if (pud_present(pud) && !pud_leaf(pud)) { spin_unlock(ptl); goto pmd_table; } else if (pud_present(pud)) { page = vm_normal_page_pud(vma, addr, pud); if (page) goto found; } /* * TODO: FW_MIGRATION support for PUD migration entries * once there are relevant users. */ spin_unlock(ptl); goto not_found; } pmd_table: VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud)); pmdp = pmd_offset(pudp, addr); pmd = pmdp_get_lockless(pmdp); if (pmd_none(pmd)) goto not_found; if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && (!pmd_present(pmd) || pmd_leaf(pmd))) { ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_get(pmdp); entry_size = PMD_SIZE; fw->level = FW_LEVEL_PMD; fw->pmdp = pmdp; fw->pmd = pmd; if (pmd_none(pmd)) { spin_unlock(ptl); goto not_found; } else if (pmd_present(pmd) && !pmd_leaf(pmd)) { spin_unlock(ptl); goto pte_table; } else if (pmd_present(pmd)) { page = vm_normal_page_pmd(vma, addr, pmd); if (page) { goto found; } else if ((flags & FW_ZEROPAGE) && is_huge_zero_pmd(pmd)) { page = pfn_to_page(pmd_pfn(pmd)); expose_page = false; goto found; } } else if ((flags & FW_MIGRATION) && is_pmd_migration_entry(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); page = pfn_swap_entry_to_page(entry); expose_page = false; goto found; } spin_unlock(ptl); goto not_found; } pte_table: VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd)); ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); if (!ptep) goto not_found; pte = ptep_get(ptep); entry_size = PAGE_SIZE; fw->level = FW_LEVEL_PTE; fw->ptep = ptep; fw->pte = pte; if (pte_present(pte)) { page = vm_normal_page(vma, addr, pte); if (page) goto found; if ((flags & FW_ZEROPAGE) && is_zero_pfn(pte_pfn(pte))) { page = pfn_to_page(pte_pfn(pte)); expose_page = false; goto found; } } else if (!pte_none(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); if ((flags & FW_MIGRATION) && is_migration_entry(entry)) { page = pfn_swap_entry_to_page(entry); expose_page = false; goto found; } } pte_unmap_unlock(ptep, ptl); not_found: vma_pgtable_walk_end(vma); return NULL; found: if (expose_page) /* Note: Offset from the mapped page, not the folio start. */ fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT); else fw->page = NULL; fw->ptl = ptl; return page_folio(page); }
97 20 98 11 73 34 81 81 81 81 81 81 79 114 1 152 49 153 14 152 13 153 19 152 2 1 41 41 2 1 2 1 2 2 2 1 1 1 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GRE_H #define __LINUX_GRE_H #include <linux/skbuff.h> #include <net/ip_tunnels.h> struct gre_base_hdr { __be16 flags; __be16 protocol; } __packed; struct gre_full_hdr { struct gre_base_hdr fixed_header; __be16 csum; __be16 reserved1; __be32 key; __be32 seq; } __packed; #define GRE_HEADER_SECTION 4 #define GREPROTO_CISCO 0 #define GREPROTO_PPTP 1 #define GREPROTO_MAX 2 #define GRE_IP_PROTO_MAX 2 struct gre_protocol { int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); }; int gre_add_protocol(const struct gre_protocol *proto, u8 version); int gre_del_protocol(const struct gre_protocol *proto, u8 version); struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type); int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs); static inline bool netif_is_gretap(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "gretap"); } static inline bool netif_is_ip6gretap(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "ip6gretap"); } static inline int gre_calc_hlen(const unsigned long *o_flags) { int addend = 4; if (test_bit(IP_TUNNEL_CSUM_BIT, o_flags)) addend += 4; if (test_bit(IP_TUNNEL_KEY_BIT, o_flags)) addend += 4; if (test_bit(IP_TUNNEL_SEQ_BIT, o_flags)) addend += 4; return addend; } static inline void gre_flags_to_tnl_flags(unsigned long *dst, __be16 flags) { IP_TUNNEL_DECLARE_FLAGS(res) = { }; __assign_bit(IP_TUNNEL_CSUM_BIT, res, flags & GRE_CSUM); __assign_bit(IP_TUNNEL_ROUTING_BIT, res, flags & GRE_ROUTING); __assign_bit(IP_TUNNEL_KEY_BIT, res, flags & GRE_KEY); __assign_bit(IP_TUNNEL_SEQ_BIT, res, flags & GRE_SEQ); __assign_bit(IP_TUNNEL_STRICT_BIT, res, flags & GRE_STRICT); __assign_bit(IP_TUNNEL_REC_BIT, res, flags & GRE_REC); __assign_bit(IP_TUNNEL_VERSION_BIT, res, flags & GRE_VERSION); ip_tunnel_flags_copy(dst, res); } static inline __be16 gre_tnl_flags_to_gre_flags(const unsigned long *tflags) { __be16 flags = 0; if (test_bit(IP_TUNNEL_CSUM_BIT, tflags)) flags |= GRE_CSUM; if (test_bit(IP_TUNNEL_ROUTING_BIT, tflags)) flags |= GRE_ROUTING; if (test_bit(IP_TUNNEL_KEY_BIT, tflags)) flags |= GRE_KEY; if (test_bit(IP_TUNNEL_SEQ_BIT, tflags)) flags |= GRE_SEQ; if (test_bit(IP_TUNNEL_STRICT_BIT, tflags)) flags |= GRE_STRICT; if (test_bit(IP_TUNNEL_REC_BIT, tflags)) flags |= GRE_REC; if (test_bit(IP_TUNNEL_VERSION_BIT, tflags)) flags |= GRE_VERSION; return flags; } static inline void gre_build_header(struct sk_buff *skb, int hdr_len, const unsigned long *flags, __be16 proto, __be32 key, __be32 seq) { IP_TUNNEL_DECLARE_FLAGS(cond) = { }; struct gre_base_hdr *greh; skb_push(skb, hdr_len); skb_set_inner_protocol(skb, proto); skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = gre_tnl_flags_to_gre_flags(flags); greh->protocol = proto; __set_bit(IP_TUNNEL_KEY_BIT, cond); __set_bit(IP_TUNNEL_CSUM_BIT, cond); __set_bit(IP_TUNNEL_SEQ_BIT, cond); if (ip_tunnel_flags_intersect(flags, cond)) { __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); if (test_bit(IP_TUNNEL_SEQ_BIT, flags)) { *ptr = seq; ptr--; } if (test_bit(IP_TUNNEL_KEY_BIT, flags)) { *ptr = key; ptr--; } if (test_bit(IP_TUNNEL_CSUM_BIT, flags) && !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; if (skb->ip_summed == CHECKSUM_PARTIAL) { *(__sum16 *)ptr = csum_fold(lco_csum(skb)); } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = sizeof(*greh); } } } } #endif
114 115 36 45 23 4 62 13 10 10 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_NULLS_H #define _LINUX_RCULIST_NULLS_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list_nulls.h> #include <linux/rcupdate.h> /** * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_nulls_add_head_rcu() or * hlist_nulls_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_nulls_for_each_entry_rcu(). */ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) { if (!hlist_nulls_unhashed(n)) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * hlist_nulls_first_rcu - returns the first element of the hash list. * @head: the head of the list. */ #define hlist_nulls_first_rcu(head) \ (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) /** * hlist_nulls_next_rcu - returns the element of the list after @node. * @node: element of the list. */ #define hlist_nulls_next_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry(). */ static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; WRITE_ONCE(n->next, first); WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_nulls_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; !is_a_nulls(i); i = i->next) last = i; if (last) { WRITE_ONCE(n->next, last->next); n->pprev = &last->next; rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { hlist_nulls_add_head_rcu(n, h); } } /* after that hlist_nulls_del will work */ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) { n->pprev = &n->next; n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); } /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. * * The barrier() is needed to make sure compiler doesn't cache first element [1], * as this loop can be restarted [2] * [1] Documentation/memory-barriers.txt around line 1533 * [2] Documentation/RCU/rculist_nulls.rst around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) /** * hlist_nulls_for_each_entry_safe - * iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. */ #define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });) #endif #endif
4201 1269 1269 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM csd #if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CSD_H #include <linux/tracepoint.h> TRACE_EVENT(csd_queue_cpu, TP_PROTO(const unsigned int cpu, unsigned long callsite, smp_call_func_t func, call_single_data_t *csd), TP_ARGS(cpu, callsite, func, csd), TP_STRUCT__entry( __field(unsigned int, cpu) __field(void *, callsite) __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->cpu = cpu; __entry->callsite = (void *)callsite; __entry->func = func; __entry->csd = csd; ), TP_printk("cpu=%u callsite=%pS func=%ps csd=%p", __entry->cpu, __entry->callsite, __entry->func, __entry->csd) ); /* * Tracepoints for a function which is called as an effect of smp_call_function.* */ DECLARE_EVENT_CLASS(csd_function, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd), TP_STRUCT__entry( __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->func = func; __entry->csd = csd; ), TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd) ); DEFINE_EVENT(csd_function, csd_function_entry, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); DEFINE_EVENT(csd_function, csd_function_exit, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); #endif /* _TRACE_CSD_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 /* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Definitions for SMC Connections, Link Groups and Links * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef _SMC_CORE_H #define _SMC_CORE_H #include <linux/atomic.h> #include <linux/types.h> #include <linux/smc.h> #include <linux/pci.h> #include <rdma/ib_verbs.h> #include <net/genetlink.h> #include <net/smc.h> #include "smc.h" #include "smc_ib.h" #include "smc_clc.h" #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ #define SMC_CONN_PER_LGR_MIN 16 /* min. # of connections per link group */ #define SMC_CONN_PER_LGR_MAX 255 /* max. # of connections per link group, * also is the default value for SMC-R v1 and v2.0 */ #define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for * SMC-R v2.1 and later negotiation, vendors or * distributions may modify it to a value between * 16-255 as needed. */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; spinlock_t lock; /* protects list of link groups */ u32 num; /* unique link group number */ }; enum smc_lgr_role { /* possible roles of a link group */ SMC_CLNT, /* client */ SMC_SERV /* server */ }; enum smc_link_state { /* possible states of a link */ SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ #define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ struct smc_wr_buf { u8 raw[SMC_WR_BUF_SIZE]; }; struct smc_wr_v2_buf { u8 raw[SMC_WR_BUF_V2_SIZE]; }; #define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */ enum smc_wr_reg_state { POSTED, /* ib_wr_reg_mr request posted */ CONFIRMED, /* ib_wr_reg_mr response: successful */ FAILED /* ib_wr_reg_mr response: failure */ }; struct smc_rdma_sge { /* sges for RDMA writes */ struct ib_sge wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE]; }; #define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per * message send */ struct smc_rdma_sges { /* sges per message send */ struct smc_rdma_sge tx_rdma_sge[SMC_MAX_RDMA_WRITES]; }; struct smc_rdma_wr { /* work requests per message * send */ struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; }; #define SMC_LGR_ID_SIZE 4 struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */ struct ib_sge *wr_tx_sges; /* WR send gather meta data */ struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ struct ib_send_wr *wr_tx_v2_ib; /* WR send v2 meta data */ struct ib_sge *wr_tx_v2_sge; /* WR send v2 gather meta data*/ struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/ atomic_long_t wr_tx_id; /* seq # of last sent WR */ unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ struct { struct percpu_ref wr_tx_refs; } ____cacheline_aligned_in_smp; struct completion tx_ref_comp; u8 *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */ int wr_rx_buflen; /* buffer len for the first sge, len for the * second sge is lgr shared if rx sge is 2. */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ u64 wr_rx_id_compl; /* seq # of last completed WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */ struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ struct { struct percpu_ref wr_reg_refs; } ____cacheline_aligned_in_smp; struct completion reg_ref_comp; enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ enum ib_mtu path_mtu; /* used mtu */ enum ib_mtu peer_mtu; /* mtu size of peer */ u32 psn_initial; /* QP tx initial packet seqno */ u32 peer_psn; /* QP rx initial packet seqno */ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ u8 link_idx; /* index in lgr link array */ u8 link_is_asym; /* is link asymmetric? */ u8 clearing : 1; /* link is being cleared */ refcount_t refcnt; /* link reference count */ struct smc_link_group *lgr; /* parent link group */ struct work_struct link_down_wrk; /* wrk to bring link down */ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ int ndev_ifidx; /* network device ifindex */ enum smc_link_state state; /* state of link */ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ }; /* For now we just allow one parallel link per link group. The SMC protocol * allows more (up to 8). */ #define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 #define SMC_LINKS_ADD_LNK_MIN 1 /* min. # of links per link group */ #define SMC_LINKS_ADD_LNK_MAX 2 /* max. # of links per link group, also is the * default value for smc-r v1.0 and v2.0 */ #define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for * SMC-R v2.1 and later negotiation, vendors or * distributions may modify it to a value between * 1-2 as needed. */ /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ struct smc_buf_desc { struct list_head list; void *cpu_addr; /* virtual address of buffer */ struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ union { struct { /* SMC-R */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; /* virtual buffer */ struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; /* memory region: for rmb and * vzalloced sndbuf * incl. rkey provided to peer * and lkey provided to local */ u32 order; /* allocation order */ u8 is_conf_rkey; /* confirm_rkey done */ u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; /* mem region registered */ u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; /* mem region mapped to lnk */ u8 is_dma_need_sync; u8 is_reg_err; /* buffer registration err */ u8 is_vm; /* virtually contiguous */ }; struct { /* SMC-D */ /* SMC-D tx buffer */ bool is_attached; /* no need for explicit writes */ /* SMC-D rx buffer: */ unsigned short sba_idx; /* SBA index number */ u64 token; /* DMB token number */ dma_addr_t dma_addr; /* DMA address */ }; }; }; struct smc_rtoken { /* address/key of remote RMB */ u64 dma_addr; u32 rkey; }; #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, * i.e. compressed 5 and thus 6 sizes (0..5), despite * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) */ struct smcd_dev; enum smc_lgr_type { /* redundancy state of lgr */ SMC_LGR_NONE, /* no active links, lgr to be deleted */ SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ }; enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ SMCR_PHYS_CONT_BUFS = 0, SMCR_VIRT_CONT_BUFS = 1, SMCR_MIXED_BUFS = 2, }; enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, SMC_LLC_FLOW_DEL_LINK = 4, SMC_LLC_FLOW_REQ_ADD_LINK = 5, SMC_LLC_FLOW_RKEY = 6, }; struct smc_llc_qentry; struct smc_llc_flow { enum smc_llc_flowtype type; struct smc_llc_qentry *qentry; }; struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ struct rw_semaphore sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ struct rw_semaphore rmbs_lock; /* protects rx buffers */ u64 alloc_sndbufs; /* stats of tx buffers */ u64 alloc_rmbs; /* stats of rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ struct work_struct terminate_work; /* abnormal lgr termination */ struct workqueue_struct *tx_wq; /* wq for conn. tx workers */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ u8 freeing : 1; /* lgr is being freed */ refcount_t refcnt; /* lgr reference count */ bool is_smcd; /* SMC-R or SMC-D */ u8 smc_version; u8 negotiated_eid[SMC_MAX_EID_LEN]; u8 peer_os; /* peer operating system */ u8 peer_smc_release; u8 peer_hostname[SMC_MAX_HOSTNAME_LEN]; union { struct { /* SMC-R */ enum smc_lgr_role role; /* client or server */ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ struct smc_wr_v2_buf *wr_rx_buf_v2; /* WR v2 recv payload buffer */ struct smc_wr_v2_buf *wr_tx_buf_v2; /* WR v2 send payload buffer */ char peer_systemid[SMC_SYSTEMID_LEN]; /* unique system_id of peer */ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] [SMC_LINKS_PER_LGR_MAX]; /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ u8 next_link_id; enum smc_lgr_type type; enum smcr_buf_type buf_type; /* redundancy state */ u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; /* pnet id of this lgr */ struct list_head llc_event_q; /* queue for llc events */ spinlock_t llc_event_q_lock; /* protects llc_event_q */ struct rw_semaphore llc_conf_mutex; /* protects lgr reconfig. */ struct work_struct llc_add_link_work; struct work_struct llc_del_link_work; struct work_struct llc_event_work; /* llc event worker */ wait_queue_head_t llc_flow_waiter; /* w4 next llc event */ wait_queue_head_t llc_msg_waiter; /* w4 next llc msg */ struct smc_llc_flow llc_flow_lcl; /* llc local control field */ struct smc_llc_flow llc_flow_rmt; /* llc remote control field */ struct smc_llc_qentry *delayed_event; /* arrived when flow active */ spinlock_t llc_flow_lock; /* protects llc flow */ int llc_testlink_time; /* link keep alive time */ u32 llc_termination_rsn; /* rsn code for termination */ u8 nexthop_mac[ETH_ALEN]; u8 uses_gateway; __be32 saddr; /* net namespace */ struct net *net; u8 max_conns; /* max conn can be assigned to lgr */ u8 max_links; /* max links can be added in lgr */ }; struct { /* SMC-D */ struct smcd_gid peer_gid; /* Peer GID (remote) */ struct smcd_dev *smcd; /* ISM device for VLAN reg. */ u8 peer_shutdown : 1; /* peer triggered shutdownn */ }; }; }; struct smc_clc_msg_local; #define GID_LIST_SIZE 2 struct smc_gidlist { u8 len; u8 list[GID_LIST_SIZE][SMC_GID_SIZE]; }; struct smc_init_info_smcrv2 { /* Input fields */ __be32 saddr; struct sock *clc_sk; __be32 daddr; /* Output fields when saddr is set */ struct smc_ib_device *ib_dev_v2; u8 ib_port_v2; u8 ib_gid_v2[SMC_GID_SIZE]; /* Additional output fields when clc_sk and daddr is set as well */ u8 uses_gateway; u8 nexthop_mac[ETH_ALEN]; struct smc_gidlist gidlist; }; #define SMC_MAX_V2_ISM_DEVS SMCD_CLC_MAX_V2_GID_ENTRIES /* max # of proposed non-native ISM devices, * which can't exceed the max # of CHID-GID * entries in CLC proposal SMC-Dv2 extension. */ struct smc_init_info { u8 is_smcd; u8 smc_type_v1; u8 smc_type_v2; u8 release_nr; u8 max_conns; u8 max_links; u8 first_contact_peer; u8 first_contact_local; u16 feature_mask; unsigned short vlan_id; u32 rc; u8 negotiated_eid[SMC_MAX_EID_LEN]; /* SMC-R */ u8 smcr_version; u8 check_smcrv2; u8 peer_gid[SMC_GID_SIZE]; u8 peer_mac[ETH_ALEN]; u8 peer_systemid[SMC_SYSTEMID_LEN]; struct smc_ib_device *ib_dev; u8 ib_gid[SMC_GID_SIZE]; u8 ib_port; u32 ib_clcqpn; struct smc_init_info_smcrv2 smcrv2; /* SMC-D */ struct smcd_gid ism_peer_gid[SMC_MAX_V2_ISM_DEVS + 1]; struct smcd_dev *ism_dev[SMC_MAX_V2_ISM_DEVS + 1]; u16 ism_chid[SMC_MAX_V2_ISM_DEVS + 1]; u8 ism_offered_cnt; /* # of ISM devices offered */ u8 ism_selected; /* index of selected ISM dev*/ u8 smcd_version; }; /* Find the connection associated with the given alert token in the link group. * To use rbtrees we have to implement our own search core. * Requires @conns_lock * @token alert token to search for * @lgr link group to search in * Returns connection associated with token if found, NULL otherwise. */ static inline struct smc_connection *smc_lgr_find_conn( u32 token, struct smc_link_group *lgr) { struct smc_connection *res = NULL; struct rb_node *node; node = lgr->conns_all.rb_node; while (node) { struct smc_connection *cur = rb_entry(node, struct smc_connection, alert_node); if (cur->alert_token_local > token) { node = node->rb_left; } else { if (cur->alert_token_local < token) { node = node->rb_right; } else { res = cur; break; } } } return res; } static inline bool smc_conn_lgr_valid(struct smc_connection *conn) { return conn->lgr && conn->alert_token_local; } /* * Returns true if the specified link is usable. * * usable means the link is ready to receive RDMA messages, map memory * on the link, etc. This doesn't ensure we are able to send RDMA messages * on this link, if sending RDMA messages is needed, use smc_link_sendable() */ static inline bool smc_link_usable(struct smc_link *lnk) { if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) return false; return true; } /* * Returns true if the specified link is ready to receive AND send RDMA * messages. * * For the client side in first contact, the underlying QP may still in * RESET or RTR when the link state is ACTIVATING, checks in smc_link_usable() * is not strong enough. For those places that need to send any CDC or LLC * messages, use smc_link_sendable(), otherwise, use smc_link_usable() instead */ static inline bool smc_link_sendable(struct smc_link *lnk) { return smc_link_usable(lnk) && lnk->qp_attr.cur_qp_state == IB_QPS_RTS; } static inline bool smc_link_active(struct smc_link *lnk) { return lnk->state == SMC_LNK_ACTIVE; } static inline bool smc_link_shared_v2_rxbuf(struct smc_link *lnk) { return lnk->wr_rx_sge_cnt > 1; } static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) { sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", be16_to_cpu(((__be16 *)gid_raw)[0]), be16_to_cpu(((__be16 *)gid_raw)[1]), be16_to_cpu(((__be16 *)gid_raw)[2]), be16_to_cpu(((__be16 *)gid_raw)[3]), be16_to_cpu(((__be16 *)gid_raw)[4]), be16_to_cpu(((__be16 *)gid_raw)[5]), be16_to_cpu(((__be16 *)gid_raw)[6]), be16_to_cpu(((__be16 *)gid_raw)[7])); } struct smc_pci_dev { __u32 pci_fid; __u16 pci_pchid; __u16 pci_vendor; __u16 pci_device; __u8 pci_id[SMC_PCI_ID_STR_LEN]; }; static inline void smc_set_pci_values(struct pci_dev *pci_dev, struct smc_pci_dev *smc_dev) { smc_dev->pci_vendor = pci_dev->vendor; smc_dev->pci_device = pci_dev->device; snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s", pci_name(pci_dev)); #if IS_ENABLED(CONFIG_S390) { /* Set s390 specific PCI information */ struct zpci_dev *zdev; zdev = to_zpci(pci_dev); smc_dev->pci_fid = zdev->fid; smc_dev->pci_pchid = zdev->pchid; } #endif } struct smc_sock; struct smc_clc_msg_accept_confirm; void smc_lgr_cleanup_early(struct smc_link_group *lgr); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smc_lgr_hold(struct smc_link_group *lgr); void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smcd_buf_attach(struct smc_sock *smc); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, __be64 nw_vaddr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); int smc_core_init(void); void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); void smcr_link_hold(struct smc_link *lnk); void smcr_link_put(struct smc_link *lnk); void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); void smcr_lgr_set_type_asym(struct smc_link_group *lgr, enum smc_lgr_type new_type, int asym_lnk_idx); int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc); struct smc_link *smc_switch_conns(struct smc_link_group *lgr, struct smc_link *from_lnk, bool is_dev_err); void smcr_link_down_cond(struct smc_link *lnk); void smcr_link_down_cond_sched(struct smc_link *lnk); int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; } #endif
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 /* SPDX-License-Identifier: GPL-2.0 * * IO cost model based controller. * * Copyright (C) 2019 Tejun Heo <tj@kernel.org> * Copyright (C) 2019 Andy Newell <newella@fb.com> * Copyright (C) 2019 Facebook * * One challenge of controlling IO resources is the lack of trivially * observable cost metric. This is distinguished from CPU and memory where * wallclock time and the number of bytes can serve as accurate enough * approximations. * * Bandwidth and iops are the most commonly used metrics for IO devices but * depending on the type and specifics of the device, different IO patterns * easily lead to multiple orders of magnitude variations rendering them * useless for the purpose of IO capacity distribution. While on-device * time, with a lot of clutches, could serve as a useful approximation for * non-queued rotational devices, this is no longer viable with modern * devices, even the rotational ones. * * While there is no cost metric we can trivially observe, it isn't a * complete mystery. For example, on a rotational device, seek cost * dominates while a contiguous transfer contributes a smaller amount * proportional to the size. If we can characterize at least the relative * costs of these different types of IOs, it should be possible to * implement a reasonable work-conserving proportional IO resource * distribution. * * 1. IO Cost Model * * IO cost model estimates the cost of an IO given its basic parameters and * history (e.g. the end sector of the last IO). The cost is measured in * device time. If a given IO is estimated to cost 10ms, the device should * be able to process ~100 of those IOs in a second. * * Currently, there's only one builtin cost model - linear. Each IO is * classified as sequential or random and given a base cost accordingly. * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. Default * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate * device-specific coefficients. * * 2. Control Strategy * * The device virtual time (vtime) is used as the primary control metric. * The control strategy is composed of the following three parts. * * 2-1. Vtime Distribution * * When a cgroup becomes active in terms of IOs, its hierarchical share is * calculated. Please consider the following hierarchy where the numbers * inside parentheses denote the configured weights. * * root * / \ * A (w:100) B (w:300) * / \ * A0 (w:100) A1 (w:100) * * If B is idle and only A0 and A1 are actively issuing IOs, as the two are * of equal weight, each gets 50% share. If then B starts issuing IOs, B * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, * 12.5% each. The distribution mechanism only cares about these flattened * shares. They're called hweights (hierarchical weights) and always add * upto 1 (WEIGHT_ONE). * * A given cgroup's vtime runs slower in inverse proportion to its hweight. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) * against the device vtime - an IO which takes 10ms on the underlying * device is considered to take 80ms on A0. * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * * 2-2. Vrate Adjustment * * It's unrealistic to expect the cost model to be perfect. There are too * many devices and even on the same device the overall performance * fluctuates depending on numerous factors such as IO mixture and device * internal garbage collection. The controller needs to adapt dynamically. * * This is achieved by adjusting the overall IO rate according to how busy * the device is. If the device becomes overloaded, we're sending down too * many IOs and should generally slow down. If there are waiting issuers * but the device isn't saturated, we're issuing too few and should * generally speed up. * * To slow down, we lower the vrate - the rate at which the device vtime * passes compared to the wall clock. For example, if the vtime is running * at the vrate of 75%, all cgroups added up would only be able to issue * 750ms worth of IOs per second, and vice-versa for speeding up. * * Device business is determined using two criteria - rq wait and * completion latencies. * * When a device gets saturated, the on-device and then the request queues * fill up and a bio which is ready to be issued has to wait for a request * to become available. When this delay becomes noticeable, it's a clear * indication that the device is saturated and we lower the vrate. This * saturation signal is fairly conservative as it only triggers when both * hardware and software queues are filled up, and is used as the default * busy signal. * * As devices can have deep queues and be unfair in how the queued commands * are executed, solely depending on rq wait may not result in satisfactory * control quality. For a better control quality, completion latency QoS * parameters can be configured so that the device is considered saturated * if N'th percentile completion latency rises above the set point. * * The completion latency requirements are a function of both the * underlying device characteristics and the desired IO latency quality of * service. There is an inherent trade-off - the tighter the latency QoS, * the higher the bandwidth lossage. Latency QoS is disabled by default * and can be set through /sys/fs/cgroup/io.cost.qos. * * 2-3. Work Conservation * * Imagine two cgroups A and B with equal weights. A is issuing a small IO * periodically while B is sending out enough parallel IOs to saturate the * device on its own. Let's say A's usage amounts to 100ms worth of IO * cost per second, i.e., 10% of the device capacity. The naive * distribution of half and half would lead to 60% utilization of the * device, a significant reduction in the total amount of work done * compared to free-for-all competition. This is too high a cost to pay * for IO control. * * To conserve the total amount of work done, we keep track of how much * each active cgroup is actually using and yield part of its weight if * there are other cgroups which can make use of it. In the above case, * A's weight will be lowered so that it hovers above the actual usage and * B would be able to use the rest. * * As we don't want to penalize a cgroup for donating its weight, the * surplus weight adjustment factors in a margin and has an immediate * snapback mechanism in case the cgroup needs more IO vtime for itself. * * Note that adjusting down surplus weights has the same effects as * accelerating vtime for other cgroups and work conservation can also be * implemented by adjusting vrate dynamically. However, squaring who can * donate and should take back how much requires hweight propagations * anyway making it easier to implement and understand as a separate * mechanism. * * 3. Monitoring * * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see * https://github.com/osandov/drgn. The output looks like the following. * * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077 * * - per : Timer period * - cur_per : Internal wall and device vtime clock * - vrate : Device virtual time rate against wall clock * - weight : Surplus-adjusted and configured weights * - hweight : Surplus-adjusted and configured hierarchical weights * - inflt : The percentage of in-flight IO cost at the end of last period * - del_ms : Deferred issuer delay induction level and duration * - usages : Usage history */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/time64.h> #include <linux/parser.h> #include <linux/sched/signal.h> #include <asm/local.h> #include <asm/local64.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" #include "blk-cgroup.h" #ifdef CONFIG_TRACEPOINTS /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */ #define TRACE_IOCG_PATH_LEN 1024 static DEFINE_SPINLOCK(trace_iocg_path_lock); static char trace_iocg_path[TRACE_IOCG_PATH_LEN]; #define TRACE_IOCG_PATH(type, iocg, ...) \ do { \ unsigned long flags; \ if (trace_iocost_##type##_enabled()) { \ spin_lock_irqsave(&trace_iocg_path_lock, flags); \ cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \ trace_iocg_path, TRACE_IOCG_PATH_LEN); \ trace_iocost_##type(iocg, trace_iocg_path, \ ##__VA_ARGS__); \ spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \ } \ } while (0) #else /* CONFIG_TRACE_POINTS */ #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0) #endif /* CONFIG_TRACE_POINTS */ enum { MILLION = 1000000, /* timer period is calculated from latency requirements, bound it */ MIN_PERIOD = USEC_PER_MSEC, MAX_PERIOD = USEC_PER_SEC, /* * iocg->vtime is targeted at 50% behind the device vtime, which * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ MARGIN_MIN_PCT = 10, MARGIN_LOW_PCT = 20, MARGIN_TARGET_PCT = 50, INUSE_ADJ_STEP_PCT = 25, /* Have some play in timer operations */ TIMER_SLACK_PCT = 1, /* 1/64k is granular enough and can easily be handled w/ u32 */ WEIGHT_ONE = 1 << 16, }; enum { /* * As vtime is used to calculate the cost of each IO, it needs to * be fairly high precision. For example, it should be able to * represent the cost of a single page worth of discard with * suffificient accuracy. At the same time, it should be able to * represent reasonably long enough durations to be useful and * convenient during operation. * * 1s worth of vtime is 2^37. This gives us both sub-nanosecond * granularity and days of wrap-around time even at extreme vrates. */ VTIME_PER_SEC_SHIFT = 37, VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT, VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC, /* bound vrate adjustments within two orders of magnitude */ VRATE_MIN_PPM = 10000, /* 1% */ VRATE_MAX_PPM = 100000000, /* 10000% */ VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, VRATE_CLAMP_ADJ_PCT = 4, /* switch iff the conditions are met for longer than this */ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, }; enum { /* if IOs end up waiting for requests, issue less */ RQ_WAIT_BUSY_PCT = 5, /* unbusy hysterisis */ UNBUSY_THR_PCT = 75, /* * The effect of delay is indirect and non-linear and a huge amount of * future debt can accumulate abruptly while unthrottled. Linearly scale * up delay as debt is going up and then let it decay exponentially. * This gives us quick ramp ups while delay is accumulating and long * tails which can help reducing the frequency of debt explosions on * unthrottle. The parameters are experimentally determined. * * The delay mechanism provides adequate protection and behavior in many * cases. However, this is far from ideal and falls shorts on both * fronts. The debtors are often throttled too harshly costing a * significant level of fairness and possibly total work while the * protection against their impacts on the system can be choppy and * unreliable. * * The shortcoming primarily stems from the fact that, unlike for page * cache, the kernel doesn't have well-defined back-pressure propagation * mechanism and policies for anonymous memory. Fully addressing this * issue will likely require substantial improvements in the area. */ MIN_DELAY_THR_PCT = 500, MAX_DELAY_THR_PCT = 25000, MIN_DELAY = 250, MAX_DELAY = 250 * USEC_PER_MSEC, /* halve debts if avg usage over 100ms is under 50% */ DFGV_USAGE_PCT = 50, DFGV_PERIOD = 100 * USEC_PER_MSEC, /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, /* * Count IO size in 4k pages. The 12bit shift helps keeping * size-proportional components of cost calculation in closer * numbers of digits to per-IO cost components. */ IOC_PAGE_SHIFT = 12, IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT, IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT, /* if apart further than 16M, consider randio for linear model */ LCOEF_RANDIO_PAGES = 4096, }; enum ioc_running { IOC_IDLE, IOC_RUNNING, IOC_STOP, }; /* io.cost.qos controls including per-dev enable of the whole controller */ enum { QOS_ENABLE, QOS_CTRL, NR_QOS_CTRL_PARAMS, }; /* io.cost.qos params */ enum { QOS_RPPM, QOS_RLAT, QOS_WPPM, QOS_WLAT, QOS_MIN, QOS_MAX, NR_QOS_PARAMS, }; /* io.cost.model controls */ enum { COST_CTRL, COST_MODEL, NR_COST_CTRL_PARAMS, }; /* builtin linear cost model coefficients */ enum { I_LCOEF_RBPS, I_LCOEF_RSEQIOPS, I_LCOEF_RRANDIOPS, I_LCOEF_WBPS, I_LCOEF_WSEQIOPS, I_LCOEF_WRANDIOPS, NR_I_LCOEFS, }; enum { LCOEF_RPAGE, LCOEF_RSEQIO, LCOEF_RRANDIO, LCOEF_WPAGE, LCOEF_WSEQIO, LCOEF_WRANDIO, NR_LCOEFS, }; enum { AUTOP_INVALID, AUTOP_HDD, AUTOP_SSD_QD1, AUTOP_SSD_DFL, AUTOP_SSD_FAST, }; struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; u64 lcoefs[NR_LCOEFS]; u32 too_fast_vrate_pct; u32 too_slow_vrate_pct; }; struct ioc_margins { s64 min; s64 low; s64 target; }; struct ioc_missed { local_t nr_met; local_t nr_missed; u32 last_met; u32 last_missed; }; struct ioc_pcpu_stat { struct ioc_missed missed[2]; local64_t rq_wait_ns; u64 last_rq_wait_ns; }; /* per device */ struct ioc { struct rq_qos rqos; bool enabled; struct ioc_params params; struct ioc_margins margins; u32 period_us; u32 timer_slack_ns; u64 vrate_min; u64 vrate_max; spinlock_t lock; struct timer_list timer; struct list_head active_iocgs; /* active cgroups */ struct ioc_pcpu_stat __percpu *pcpu_stat; enum ioc_running running; atomic64_t vtime_rate; u64 vtime_base_rate; s64 vtime_err; seqcount_spinlock_t period_seqcount; u64 period_at; /* wallclock starttime */ u64 period_at_vtime; /* vtime starttime */ atomic64_t cur_period; /* inc'd each period */ int busy_level; /* saturation history */ bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ /* debt forgivness */ u64 dfgv_period_at; u64 dfgv_period_rem; u64 dfgv_usage_us_sum; u64 autop_too_fast_at; u64 autop_too_slow_at; int autop_idx; bool user_qos_params:1; bool user_cost_model:1; }; struct iocg_pcpu_stat { local64_t abs_vusage; }; struct iocg_stat { u64 usage_us; u64 wait_us; u64 indebt_us; u64 indelay_us; }; /* per device-cgroup pair */ struct ioc_gq { struct blkg_policy_data pd; struct ioc *ioc; /* * A iocg can get its weight from two sources - an explicit * per-device-cgroup configuration or the default weight of the * cgroup. `cfg_weight` is the explicit per-device-cgroup * configuration. `weight` is the effective considering both * sources. * * When an idle cgroup becomes active its `active` goes from 0 to * `weight`. `inuse` is the surplus adjusted active weight. * `active` and `inuse` are used to calculate `hweight_active` and * `hweight_inuse`. * * `last_inuse` remembers `inuse` while an iocg is idle to persist * surplus adjustments. * * `inuse` may be adjusted dynamically during period. `saved_*` are used * to determine and track adjustments. */ u32 cfg_weight; u32 weight; u32 active; u32 inuse; u32 last_inuse; s64 saved_margin; sector_t cursor; /* to detect randio */ /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents * the currently available IO budget. If running ahead, the * overage. * * `vtime_done` is the same but progressed on completion rather * than issue. The delta behind `vtime` represents the cost of * currently in-flight IOs. */ atomic64_t vtime; atomic64_t done_vtime; u64 abs_vdebt; /* current delay in effect and when it started */ u64 delay; u64 delay_at; /* * The period this iocg was last active in. Used for deactivation * and invalidating `vtime`. */ atomic64_t active_period; struct list_head active_list; /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; u64 child_adjusted_sum; int hweight_gen; u32 hweight_active; u32 hweight_inuse; u32 hweight_donating; u32 hweight_after_donation; struct list_head walk_list; struct list_head surplus_list; struct wait_queue_head waitq; struct hrtimer waitq_timer; /* timestamp at the latest activation */ u64 activated_at; /* statistics */ struct iocg_pcpu_stat __percpu *pcpu_stat; struct iocg_stat stat; struct iocg_stat last_stat; u64 last_stat_abs_vusage; u64 usage_delta_us; u64 wait_since; u64 indebt_since; u64 indelay_since; /* this iocg's depth in the hierarchy and ancestors including self */ int level; struct ioc_gq *ancestors[]; }; /* per cgroup */ struct ioc_cgrp { struct blkcg_policy_data cpd; unsigned int dfl_weight; }; struct ioc_now { u64 now_ns; u64 now; u64 vnow; }; struct iocg_wait { struct wait_queue_entry wait; struct bio *bio; u64 abs_cost; bool committed; }; struct iocg_wake_ctx { struct ioc_gq *iocg; u32 hw_inuse; s64 vbudget; }; static const struct ioc_params autop[] = { [AUTOP_HDD] = { .qos = { [QOS_RLAT] = 250000, /* 250ms */ [QOS_WLAT] = 250000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 174019176, [I_LCOEF_RSEQIOPS] = 41708, [I_LCOEF_RRANDIOPS] = 370, [I_LCOEF_WBPS] = 178075866, [I_LCOEF_WSEQIOPS] = 42705, [I_LCOEF_WRANDIOPS] = 378, }, }, [AUTOP_SSD_QD1] = { .qos = { [QOS_RLAT] = 25000, /* 25ms */ [QOS_WLAT] = 25000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 245855193, [I_LCOEF_RSEQIOPS] = 61575, [I_LCOEF_RRANDIOPS] = 6946, [I_LCOEF_WBPS] = 141365009, [I_LCOEF_WSEQIOPS] = 33716, [I_LCOEF_WRANDIOPS] = 26796, }, }, [AUTOP_SSD_DFL] = { .qos = { [QOS_RLAT] = 25000, /* 25ms */ [QOS_WLAT] = 25000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 488636629, [I_LCOEF_RSEQIOPS] = 8932, [I_LCOEF_RRANDIOPS] = 8518, [I_LCOEF_WBPS] = 427891549, [I_LCOEF_WSEQIOPS] = 28755, [I_LCOEF_WRANDIOPS] = 21940, }, .too_fast_vrate_pct = 500, }, [AUTOP_SSD_FAST] = { .qos = { [QOS_RLAT] = 5000, /* 5ms */ [QOS_WLAT] = 5000, [QOS_MIN] = VRATE_MIN_PPM, [QOS_MAX] = VRATE_MAX_PPM, }, .i_lcoefs = { [I_LCOEF_RBPS] = 3102524156LLU, [I_LCOEF_RSEQIOPS] = 724816, [I_LCOEF_RRANDIOPS] = 778122, [I_LCOEF_WBPS] = 1742780862LLU, [I_LCOEF_WSEQIOPS] = 425702, [I_LCOEF_WRANDIOPS] = 443193, }, .too_slow_vrate_pct = 10, }, }; /* * vrate adjust percentages indexed by ioc->busy_level. We adjust up on * vtime credit shortage and down on device saturation. */ static const u32 vrate_adj_pct[] = { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 }; static struct blkcg_policy blkcg_policy_iocost; /* accessors and helpers */ static struct ioc *rqos_to_ioc(struct rq_qos *rqos) { return container_of(rqos, struct ioc, rqos); } static struct ioc *q_to_ioc(struct request_queue *q) { return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); } static const char __maybe_unused *ioc_name(struct ioc *ioc) { struct gendisk *disk = ioc->rqos.disk; if (!disk) return "<unknown>"; return disk->disk_name; } static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct ioc_gq, pd) : NULL; } static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg) { return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost)); } static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg) { return pd_to_blkg(&iocg->pd); } static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost), struct ioc_cgrp, cpd); } /* * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical * weight, the more expensive each IO. Must round up. */ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) { return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); } /* * The inverse of abs_cost_to_cost(). Must round up. */ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) { return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 abs_cost, u64 cost) { struct iocg_pcpu_stat *gcs; bio->bi_iocost_cost = cost; atomic64_add(cost, &iocg->vtime); gcs = get_cpu_ptr(iocg->pcpu_stat); local64_add(abs_cost, &gcs->abs_vusage); put_cpu_ptr(gcs); } static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) { if (lock_ioc) { spin_lock_irqsave(&iocg->ioc->lock, *flags); spin_lock(&iocg->waitq.lock); } else { spin_lock_irqsave(&iocg->waitq.lock, *flags); } } static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) { if (unlock_ioc) { spin_unlock(&iocg->waitq.lock); spin_unlock_irqrestore(&iocg->ioc->lock, *flags); } else { spin_unlock_irqrestore(&iocg->waitq.lock, *flags); } } #define CREATE_TRACE_POINTS #include <trace/events/iocost.h> static void ioc_refresh_margins(struct ioc *ioc) { struct ioc_margins *margins = &ioc->margins; u32 period_us = ioc->period_us; u64 vrate = ioc->vtime_base_rate; margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; } /* latency Qos params changed, update period_us and all the dependent params */ static void ioc_refresh_period_us(struct ioc *ioc) { u32 ppm, lat, multi, period_us; lockdep_assert_held(&ioc->lock); /* pick the higher latency target */ if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { ppm = ioc->params.qos[QOS_RPPM]; lat = ioc->params.qos[QOS_RLAT]; } else { ppm = ioc->params.qos[QOS_WPPM]; lat = ioc->params.qos[QOS_WLAT]; } /* * We want the period to be long enough to contain a healthy number * of IOs while short enough for granular control. Define it as a * multiple of the latency target. Ideally, the multiplier should * be scaled according to the percentile so that it would nominally * contain a certain number of requests. Let's be simpler and * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50). */ if (ppm) multi = max_t(u32, (MILLION - ppm) / 50000, 2); else multi = 2; period_us = multi * lat; period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD); /* calculate dependent params */ ioc->period_us = period_us; ioc->timer_slack_ns = div64_u64( (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, 100); ioc_refresh_margins(ioc); } /* * ioc->rqos.disk isn't initialized when this function is called from * the init path. */ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) { int idx = ioc->autop_idx; const struct ioc_params *p = &autop[idx]; u32 vrate_pct; u64 now_ns; /* rotational? */ if (!blk_queue_nonrot(disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ if (blk_queue_depth(disk->queue) == 1) return AUTOP_SSD_QD1; /* use one of the normal ssd sets */ if (idx < AUTOP_SSD_DFL) return AUTOP_SSD_DFL; /* if user is overriding anything, maintain what was there */ if (ioc->user_qos_params || ioc->user_cost_model) return idx; /* step up/down based on the vrate */ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); now_ns = blk_time_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { if (!ioc->autop_too_fast_at) ioc->autop_too_fast_at = now_ns; if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) return idx + 1; } else { ioc->autop_too_fast_at = 0; } if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { if (!ioc->autop_too_slow_at) ioc->autop_too_slow_at = now_ns; if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) return idx - 1; } else { ioc->autop_too_slow_at = 0; } return idx; } /* * Take the followings as input * * @bps maximum sequential throughput * @seqiops maximum sequential 4k iops * @randiops maximum random 4k iops * * and calculate the linear model cost coefficients. * * *@page per-page cost 1s / (@bps / 4096) * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0) * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0) */ static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops, u64 *page, u64 *seqio, u64 *randio) { u64 v; *page = *seqio = *randio = 0; if (bps) { u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE); if (bps_pages) *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages); else *page = 1; } if (seqiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops); if (v > *page) *seqio = v - *page; } if (randiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops); if (v > *page) *randio = v - *page; } } static void ioc_refresh_lcoefs(struct ioc *ioc) { u64 *u = ioc->params.i_lcoefs; u64 *c = ioc->params.lcoefs; calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]); calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS], &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]); } /* * struct gendisk is required as an argument because ioc->rqos.disk * is not properly initialized when called from the init path. */ static bool ioc_refresh_params_disk(struct ioc *ioc, bool force, struct gendisk *disk) { const struct ioc_params *p; int idx; lockdep_assert_held(&ioc->lock); idx = ioc_autop_idx(ioc, disk); p = &autop[idx]; if (idx == ioc->autop_idx && !force) return false; if (idx != ioc->autop_idx) { atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); ioc->vtime_base_rate = VTIME_PER_USEC; } ioc->autop_idx = idx; ioc->autop_too_fast_at = 0; ioc->autop_too_slow_at = 0; if (!ioc->user_qos_params) memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); if (!ioc->user_cost_model) memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); ioc_refresh_period_us(ioc); ioc_refresh_lcoefs(ioc); ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * VTIME_PER_USEC, MILLION); ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] * VTIME_PER_USEC, MILLION); return true; } static bool ioc_refresh_params(struct ioc *ioc, bool force) { return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk); } /* * When an iocg accumulates too much vtime or gets deactivated, we throw away * some vtime, which lowers the overall device utilization. As the exact amount * which is being thrown away is known, we can compensate by accelerating the * vrate accordingly so that the extra vtime generated in the current period * matches what got lost. */ static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) { s64 pleft = ioc->period_at + ioc->period_us - now->now; s64 vperiod = ioc->period_us * ioc->vtime_base_rate; s64 vcomp, vcomp_min, vcomp_max; lockdep_assert_held(&ioc->lock); /* we need some time left in this period */ if (pleft <= 0) goto done; /* * Calculate how much vrate should be adjusted to offset the error. * Limit the amount of adjustment and deduct the adjusted amount from * the error. */ vcomp = -div64_s64(ioc->vtime_err, pleft); vcomp_min = -(ioc->vtime_base_rate >> 1); vcomp_max = ioc->vtime_base_rate; vcomp = clamp(vcomp, vcomp_min, vcomp_max); ioc->vtime_err += vcomp * pleft; atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); done: /* bound how much error can accumulate */ ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); } static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, int nr_lagging, int nr_shortages, int prev_busy_level, u32 *missed_ppm) { u64 vrate = ioc->vtime_base_rate; u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { if (ioc->busy_level != prev_busy_level || nr_lagging) trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); return; } /* * If vrate is out of bounds, apply clamp gradually as the * bounds can change abruptly. Otherwise, apply busy_level * based adjustment. */ if (vrate < vrate_min) { vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); vrate = min(vrate, vrate_min); } else if (vrate > vrate_max) { vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); vrate = max(vrate, vrate_max); } else { int idx = min_t(int, abs(ioc->busy_level), ARRAY_SIZE(vrate_adj_pct) - 1); u32 adj_pct = vrate_adj_pct[idx]; if (ioc->busy_level > 0) adj_pct = 100 - adj_pct; else adj_pct = 100 + adj_pct; vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), vrate_min, vrate_max); } trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); ioc->vtime_base_rate = vrate; ioc_refresh_margins(ioc); } /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { unsigned seq; u64 vrate; now->now_ns = blk_time_get_ns(); now->now = ktime_to_us(now->now_ns); vrate = atomic64_read(&ioc->vtime_rate); /* * The current vtime is * * vtime at period start + (wallclock time since the start) * vrate * * As a consistent snapshot of `period_at_vtime` and `period_at` is * needed, they're seqcount protected. */ do { seq = read_seqcount_begin(&ioc->period_seqcount); now->vnow = ioc->period_at_vtime + (now->now - ioc->period_at) * vrate; } while (read_seqcount_retry(&ioc->period_seqcount, seq)); } static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) { WARN_ON_ONCE(ioc->running != IOC_RUNNING); write_seqcount_begin(&ioc->period_seqcount); ioc->period_at = now->now; ioc->period_at_vtime = now->vnow; write_seqcount_end(&ioc->period_seqcount); ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); add_timer(&ioc->timer); } /* * Update @iocg's `active` and `inuse` to @active and @inuse, update level * weight sums and propagate upwards accordingly. If @save, the current margin * is saved to be used as reference for later inuse in-period adjustments. */ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, bool save, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; int lvl; lockdep_assert_held(&ioc->lock); /* * For an active leaf node, its inuse shouldn't be zero or exceed * @active. An active internal node's inuse is solely determined by the * inuse to active ratio of its children regardless of @inuse. */ if (list_empty(&iocg->active_list) && iocg->child_active_sum) { inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, iocg->child_active_sum); } else { /* * It may be tempting to turn this into a clamp expression with * a lower limit of 1 but active may be 0, which cannot be used * as an upper limit in that situation. This expression allows * active to clamp inuse unless it is 0, in which case inuse * becomes 1. */ inuse = min(inuse, active) ?: 1; } iocg->last_inuse = iocg->inuse; if (save) iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); if (active == iocg->active && inuse == iocg->inuse) return; for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; u32 parent_active = 0, parent_inuse = 0; /* update the level sums */ parent->child_active_sum += (s32)(active - child->active); parent->child_inuse_sum += (s32)(inuse - child->inuse); /* apply the updates */ child->active = active; child->inuse = inuse; /* * The delta between inuse and active sums indicates that * much of weight is being given away. Parent's inuse * and active should reflect the ratio. */ if (parent->child_active_sum) { parent_active = parent->weight; parent_inuse = DIV64_U64_ROUND_UP( parent_active * parent->child_inuse_sum, parent->child_active_sum); } /* do we need to keep walking up? */ if (parent_active == parent->active && parent_inuse == parent->inuse) break; active = parent_active; inuse = parent_inuse; } ioc->weights_updated = true; } static void commit_weights(struct ioc *ioc) { lockdep_assert_held(&ioc->lock); if (ioc->weights_updated) { /* paired with rmb in current_hweight(), see there */ smp_wmb(); atomic_inc(&ioc->hweight_gen); ioc->weights_updated = false; } } static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, bool save, struct ioc_now *now) { __propagate_weights(iocg, active, inuse, save, now); commit_weights(iocg->ioc); } static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) { struct ioc *ioc = iocg->ioc; int lvl; u32 hwa, hwi; int ioc_gen; /* hot path - if uptodate, use cached */ ioc_gen = atomic_read(&ioc->hweight_gen); if (ioc_gen == iocg->hweight_gen) goto out; /* * Paired with wmb in commit_weights(). If we saw the updated * hweight_gen, all the weight updates from __propagate_weights() are * visible too. * * We can race with weight updates during calculation and get it * wrong. However, hweight_gen would have changed and a future * reader will recalculate and we're guaranteed to discard the * wrong result soon. */ smp_rmb(); hwa = hwi = WEIGHT_ONE; for (lvl = 0; lvl <= iocg->level - 1; lvl++) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; u64 active_sum = READ_ONCE(parent->child_active_sum); u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); u32 active = READ_ONCE(child->active); u32 inuse = READ_ONCE(child->inuse); /* we can race with deactivations and either may read as zero */ if (!active_sum || !inuse_sum) continue; active_sum = max_t(u64, active, active_sum); hwa = div64_u64((u64)hwa * active, active_sum); inuse_sum = max_t(u64, inuse, inuse_sum); hwi = div64_u64((u64)hwi * inuse, inuse_sum); } iocg->hweight_active = max_t(u32, hwa, 1); iocg->hweight_inuse = max_t(u32, hwi, 1); iocg->hweight_gen = ioc_gen; out: if (hw_activep) *hw_activep = iocg->hweight_active; if (hw_inusep) *hw_inusep = iocg->hweight_inuse; } /* * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the * other weights stay unchanged. */ static u32 current_hweight_max(struct ioc_gq *iocg) { u32 hwm = WEIGHT_ONE; u32 inuse = iocg->active; u64 child_inuse_sum; int lvl; lockdep_assert_held(&iocg->ioc->lock); for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; hwm = div64_u64((u64)hwm * inuse, child_inuse_sum); inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, parent->child_active_sum); } return max_t(u32, hwm, 1); } static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); u32 weight; lockdep_assert_held(&ioc->lock); weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) propagate_weights(iocg, weight, iocg->inuse, true, now); iocg->weight = weight; } static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 __maybe_unused last_period, cur_period; u64 vtime, vtarget; int i; /* * If seem to be already active, just update the stamp to tell the * timer that we're still active. We don't mind occassional races. */ if (!list_empty(&iocg->active_list)) { ioc_now(ioc, now); cur_period = atomic64_read(&ioc->cur_period); if (atomic64_read(&iocg->active_period) != cur_period) atomic64_set(&iocg->active_period, cur_period); return true; } /* racy check on internal node IOs, treat as root level IOs */ if (iocg->child_active_sum) return false; spin_lock_irq(&ioc->lock); ioc_now(ioc, now); /* update period */ cur_period = atomic64_read(&ioc->cur_period); last_period = atomic64_read(&iocg->active_period); atomic64_set(&iocg->active_period, cur_period); /* already activated or breaking leaf-only constraint? */ if (!list_empty(&iocg->active_list)) goto succeed_unlock; for (i = iocg->level - 1; i > 0; i--) if (!list_empty(&iocg->ancestors[i]->active_list)) goto fail_unlock; if (iocg->child_active_sum) goto fail_unlock; /* * Always start with the target budget. On deactivation, we throw away * anything above it. */ vtarget = now->vnow - ioc->margins.target; vtime = atomic64_read(&iocg->vtime); atomic64_add(vtarget - vtime, &iocg->vtime); atomic64_add(vtarget - vtime, &iocg->done_vtime); vtime = vtarget; /* * Activate, propagate weight and start period timer if not * running. Reset hweight_gen to avoid accidental match from * wrapping. */ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; list_add(&iocg->active_list, &ioc->active_iocgs); propagate_weights(iocg, iocg->weight, iocg->last_inuse ?: iocg->weight, true, now); TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); iocg->activated_at = now->now; if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; ioc->dfgv_period_at = now->now; ioc->dfgv_period_rem = 0; ioc_start_period(ioc, now); } succeed_unlock: spin_unlock_irq(&ioc->lock); return true; fail_unlock: spin_unlock_irq(&ioc->lock); return false; } static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 tdelta, delay, new_delay, shift; s64 vover, vover_pct; u32 hwa; lockdep_assert_held(&iocg->waitq.lock); /* * If the delay is set by another CPU, we may be in the past. No need to * change anything if so. This avoids decay calculation underflow. */ if (time_before64(now->now, iocg->delay_at)) return false; /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; shift = div64_u64(tdelta, USEC_PER_SEC); if (iocg->delay && shift < BITS_PER_LONG) delay = iocg->delay >> shift; else delay = 0; /* calculate the new delay from the debt amount */ current_hweight(iocg, &hwa, NULL); vover = atomic64_read(&iocg->vtime) + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; vover_pct = div64_s64(100 * vover, ioc->period_us * ioc->vtime_base_rate); if (vover_pct <= MIN_DELAY_THR_PCT) new_delay = 0; else if (vover_pct >= MAX_DELAY_THR_PCT) new_delay = MAX_DELAY; else new_delay = MIN_DELAY + div_u64((MAX_DELAY - MIN_DELAY) * (vover_pct - MIN_DELAY_THR_PCT), MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); /* pick the higher one and apply */ if (new_delay > delay) { iocg->delay = new_delay; iocg->delay_at = now->now; delay = new_delay; } if (delay >= MIN_DELAY) { if (!iocg->indelay_since) iocg->indelay_since = now->now; blkcg_set_delay(blkg, delay * NSEC_PER_USEC); return true; } else { if (iocg->indelay_since) { iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = 0; } iocg->delay = 0; blkcg_clear_delay(blkg); return false; } } static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, struct ioc_now *now) { struct iocg_pcpu_stat *gcs; lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); WARN_ON_ONCE(list_empty(&iocg->active_list)); /* * Once in debt, debt handling owns inuse. @iocg stays at the minimum * inuse donating all of it share to others until its debt is paid off. */ if (!iocg->abs_vdebt && abs_cost) { iocg->indebt_since = now->now; propagate_weights(iocg, iocg->active, 0, false, now); } iocg->abs_vdebt += abs_cost; gcs = get_cpu_ptr(iocg->pcpu_stat); local64_add(abs_cost, &gcs->abs_vusage); put_cpu_ptr(gcs); } static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, struct ioc_now *now) { lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); /* * make sure that nobody messed with @iocg. Check iocg->pd.online * to avoid warn when removing blkcg or disk. */ WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online); WARN_ON_ONCE(iocg->inuse > 1); iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); /* if debt is paid in full, restore inuse */ if (!iocg->abs_vdebt) { iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = 0; propagate_weights(iocg, iocg->active, iocg->last_inuse, false, now); } } static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); struct iocg_wake_ctx *ctx = key; u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); ctx->vbudget -= cost; if (ctx->vbudget < 0) return -1; iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); wait->committed = true; /* * autoremove_wake_function() removes the wait entry only when it * actually changed the task state. We want the wait always removed. * Remove explicitly and use default_wake_function(). Note that the * order of operations is important as finish_wait() tests whether * @wq_entry is removed without grabbing the lock. */ default_wake_function(wq_entry, mode, flags, key); list_del_init_careful(&wq_entry->entry); return 0; } /* * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in * addition to iocg->waitq.lock. */ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct iocg_wake_ctx ctx = { .iocg = iocg }; u64 vshortage, expires, oexpires; s64 vbudget; u32 hwa; lockdep_assert_held(&iocg->waitq.lock); current_hweight(iocg, &hwa, NULL); vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ if (pay_debt && iocg->abs_vdebt && vbudget > 0) { u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa); u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); u64 vpay = abs_cost_to_cost(abs_vpay, hwa); lockdep_assert_held(&ioc->lock); atomic64_add(vpay, &iocg->vtime); atomic64_add(vpay, &iocg->done_vtime); iocg_pay_debt(iocg, abs_vpay, now); vbudget -= vpay; } if (iocg->abs_vdebt || iocg->delay) iocg_kick_delay(iocg, now); /* * Debt can still be outstanding if we haven't paid all yet or the * caller raced and called without @pay_debt. Shouldn't wake up waiters * under debt. Make sure @vbudget reflects the outstanding amount and is * not positive. */ if (iocg->abs_vdebt) { s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); vbudget = min_t(s64, 0, vbudget - vdebt); } /* * Wake up the ones which are due and see how much vtime we'll need for * the next one. As paying off debt restores hw_inuse, it must be read * after the above debt payment. */ ctx.vbudget = vbudget; current_hweight(iocg, NULL, &ctx.hw_inuse); __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); if (!waitqueue_active(&iocg->waitq)) { if (iocg->wait_since) { iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = 0; } return; } if (!iocg->wait_since) iocg->wait_since = now->now; if (WARN_ON_ONCE(ctx.vbudget >= 0)) return; /* determine next wakeup, add a timer margin to guarantee chunking */ vshortage = -ctx.vbudget; expires = now->now_ns + DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * NSEC_PER_USEC; expires += ioc->timer_slack_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); if (hrtimer_is_queued(&iocg->waitq_timer) && abs(oexpires - expires) <= ioc->timer_slack_ns) return; hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), ioc->timer_slack_ns, HRTIMER_MODE_ABS); } static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) { struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); bool pay_debt = READ_ONCE(iocg->abs_vdebt); struct ioc_now now; unsigned long flags; ioc_now(iocg->ioc, &now); iocg_lock(iocg, pay_debt, &flags); iocg_kick_waitq(iocg, pay_debt, &now); iocg_unlock(iocg, pay_debt, &flags); return HRTIMER_NORESTART; } static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) { u32 nr_met[2] = { }; u32 nr_missed[2] = { }; u64 rq_wait_ns = 0; int cpu, rw; for_each_online_cpu(cpu) { struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); u64 this_rq_wait_ns; for (rw = READ; rw <= WRITE; rw++) { u32 this_met = local_read(&stat->missed[rw].nr_met); u32 this_missed = local_read(&stat->missed[rw].nr_missed); nr_met[rw] += this_met - stat->missed[rw].last_met; nr_missed[rw] += this_missed - stat->missed[rw].last_missed; stat->missed[rw].last_met = this_met; stat->missed[rw].last_missed = this_missed; } this_rq_wait_ns = local64_read(&stat->rq_wait_ns); rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; stat->last_rq_wait_ns = this_rq_wait_ns; } for (rw = READ; rw <= WRITE; rw++) { if (nr_met[rw] + nr_missed[rw]) missed_ppm_ar[rw] = DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION, nr_met[rw] + nr_missed[rw]); else missed_ppm_ar[rw] = 0; } *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, ioc->period_us * NSEC_PER_USEC); } /* was iocg idle this period? */ static bool iocg_is_idle(struct ioc_gq *iocg) { struct ioc *ioc = iocg->ioc; /* did something get issued this period? */ if (atomic64_read(&iocg->active_period) == atomic64_read(&ioc->cur_period)) return false; /* is something in flight? */ if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime)) return false; return true; } /* * Call this function on the target leaf @iocg's to build pre-order traversal * list of all the ancestors in @inner_walk. The inner nodes are linked through * ->walk_list and the caller is responsible for dissolving the list after use. */ static void iocg_build_inner_walk(struct ioc_gq *iocg, struct list_head *inner_walk) { int lvl; WARN_ON_ONCE(!list_empty(&iocg->walk_list)); /* find the first ancestor which hasn't been visited yet */ for (lvl = iocg->level - 1; lvl >= 0; lvl--) { if (!list_empty(&iocg->ancestors[lvl]->walk_list)) break; } /* walk down and visit the inner nodes to get pre-order traversal */ while (++lvl <= iocg->level - 1) { struct ioc_gq *inner = iocg->ancestors[lvl]; /* record traversal order */ list_add_tail(&inner->walk_list, inner_walk); } } /* propagate the deltas to the parent */ static void iocg_flush_stat_upward(struct ioc_gq *iocg) { if (iocg->level > 0) { struct iocg_stat *parent_stat = &iocg->ancestors[iocg->level - 1]->stat; parent_stat->usage_us += iocg->stat.usage_us - iocg->last_stat.usage_us; parent_stat->wait_us += iocg->stat.wait_us - iocg->last_stat.wait_us; parent_stat->indebt_us += iocg->stat.indebt_us - iocg->last_stat.indebt_us; parent_stat->indelay_us += iocg->stat.indelay_us - iocg->last_stat.indelay_us; } iocg->last_stat = iocg->stat; } /* collect per-cpu counters and propagate the deltas to the parent */ static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 abs_vusage = 0; u64 vusage_delta; int cpu; lockdep_assert_held(&iocg->ioc->lock); /* collect per-cpu counters */ for_each_possible_cpu(cpu) { abs_vusage += local64_read( per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); } vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; iocg->last_stat_abs_vusage = abs_vusage; iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); iocg->stat.usage_us += iocg->usage_delta_us; iocg_flush_stat_upward(iocg); } /* get stat counters ready for reading on all active iocgs */ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) { LIST_HEAD(inner_walk); struct ioc_gq *iocg, *tiocg; /* flush leaves and build inner node walk list */ list_for_each_entry(iocg, target_iocgs, active_list) { iocg_flush_stat_leaf(iocg, now); iocg_build_inner_walk(iocg, &inner_walk); } /* keep flushing upwards by walking the inner list backwards */ list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { iocg_flush_stat_upward(iocg); list_del_init(&iocg->walk_list); } } /* * Determine what @iocg's hweight_inuse should be after donating unused * capacity. @hwm is the upper bound and used to signal no donation. This * function also throws away @iocg's excess budget. */ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm, u32 usage, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 vtime = atomic64_read(&iocg->vtime); s64 excess, delta, target, new_hwi; /* debt handling owns inuse for debtors */ if (iocg->abs_vdebt) return 1; /* see whether minimum margin requirement is met */ if (waitqueue_active(&iocg->waitq) || time_after64(vtime, now->vnow - ioc->margins.min)) return hwm; /* throw away excess above target */ excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { atomic64_add(excess, &iocg->vtime); atomic64_add(excess, &iocg->done_vtime); vtime += excess; ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); } /* * Let's say the distance between iocg's and device's vtimes as a * fraction of period duration is delta. Assuming that the iocg will * consume the usage determined above, we want to determine new_hwi so * that delta equals MARGIN_TARGET at the end of the next period. * * We need to execute usage worth of IOs while spending the sum of the * new budget (1 - MARGIN_TARGET) and the leftover from the last period * (delta): * * usage = (1 - MARGIN_TARGET + delta) * new_hwi * * Therefore, the new_hwi is: * * new_hwi = usage / (1 - MARGIN_TARGET + delta) */ delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), now->vnow - ioc->period_at_vtime); target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100; new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); return clamp_t(s64, new_hwi, 1, hwm); } /* * For work-conservation, an iocg which isn't using all of its share should * donate the leftover to other iocgs. There are two ways to achieve this - 1. * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight. * * #1 is mathematically simpler but has the drawback of requiring synchronous * global hweight_inuse updates when idle iocg's get activated or inuse weights * change due to donation snapbacks as it has the possibility of grossly * overshooting what's allowed by the model and vrate. * * #2 is inherently safe with local operations. The donating iocg can easily * snap back to higher weights when needed without worrying about impacts on * other nodes as the impacts will be inherently correct. This also makes idle * iocg activations safe. The only effect activations have is decreasing * hweight_inuse of others, the right solution to which is for those iocgs to * snap back to higher weights. * * So, we go with #2. The challenge is calculating how each donating iocg's * inuse should be adjusted to achieve the target donation amounts. This is done * using Andy's method described in the following pdf. * * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo * * Given the weights and target after-donation hweight_inuse values, Andy's * method determines how the proportional distribution should look like at each * sibling level to maintain the relative relationship between all non-donating * pairs. To roughly summarize, it divides the tree into donating and * non-donating parts, calculates global donation rate which is used to * determine the target hweight_inuse for each node, and then derives per-level * proportions. * * The following pdf shows that global distribution calculated this way can be * achieved by scaling inuse weights of donating leaves and propagating the * adjustments upwards proportionally. * * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE * * Combining the above two, we can determine how each leaf iocg's inuse should * be adjusted to achieve the target donation. * * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN * * The inline comments use symbols from the last pdf. * * b is the sum of the absolute budgets in the subtree. 1 for the root node. * f is the sum of the absolute budgets of non-donating nodes in the subtree. * t is the sum of the absolute budgets of donating nodes in the subtree. * w is the weight of the node. w = w_f + w_t * w_f is the non-donating portion of w. w_f = w * f / b * w_b is the donating portion of w. w_t = w * t / b * s is the sum of all sibling weights. s = Sum(w) for siblings * s_f and s_t are the non-donating and donating portions of s. * * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g. * w_pt is the donating portion of the parent's weight and w'_pt the same value * after adjustments. Subscript r denotes the root node's values. */ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) { LIST_HEAD(over_hwa); LIST_HEAD(inner_walk); struct ioc_gq *iocg, *tiocg, *root_iocg; u32 after_sum, over_sum, over_target, gamma; /* * It's pretty unlikely but possible for the total sum of * hweight_after_donation's to be higher than WEIGHT_ONE, which will * confuse the following calculations. If such condition is detected, * scale down everyone over its full share equally to keep the sum below * WEIGHT_ONE. */ after_sum = 0; over_sum = 0; list_for_each_entry(iocg, surpluses, surplus_list) { u32 hwa; current_hweight(iocg, &hwa, NULL); after_sum += iocg->hweight_after_donation; if (iocg->hweight_after_donation > hwa) { over_sum += iocg->hweight_after_donation; list_add(&iocg->walk_list, &over_hwa); } } if (after_sum >= WEIGHT_ONE) { /* * The delta should be deducted from the over_sum, calculate * target over_sum value. */ u32 over_delta = after_sum - (WEIGHT_ONE - 1); WARN_ON_ONCE(over_sum <= over_delta); over_target = over_sum - over_delta; } else { over_target = 0; } list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) { if (over_target) iocg->hweight_after_donation = div_u64((u64)iocg->hweight_after_donation * over_target, over_sum); list_del_init(&iocg->walk_list); } /* * Build pre-order inner node walk list and prepare for donation * adjustment calculations. */ list_for_each_entry(iocg, surpluses, surplus_list) { iocg_build_inner_walk(iocg, &inner_walk); } root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list); WARN_ON_ONCE(root_iocg->level > 0); list_for_each_entry(iocg, &inner_walk, walk_list) { iocg->child_adjusted_sum = 0; iocg->hweight_donating = 0; iocg->hweight_after_donation = 0; } /* * Propagate the donating budget (b_t) and after donation budget (b'_t) * up the hierarchy. */ list_for_each_entry(iocg, surpluses, surplus_list) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; parent->hweight_donating += iocg->hweight_donating; parent->hweight_after_donation += iocg->hweight_after_donation; } list_for_each_entry_reverse(iocg, &inner_walk, walk_list) { if (iocg->level > 0) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; parent->hweight_donating += iocg->hweight_donating; parent->hweight_after_donation += iocg->hweight_after_donation; } } /* * Calculate inner hwa's (b) and make sure the donation values are * within the accepted ranges as we're doing low res calculations with * roundups. */ list_for_each_entry(iocg, &inner_walk, walk_list) { if (iocg->level) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; iocg->hweight_active = DIV64_U64_ROUND_UP( (u64)parent->hweight_active * iocg->active, parent->child_active_sum); } iocg->hweight_donating = min(iocg->hweight_donating, iocg->hweight_active); iocg->hweight_after_donation = min(iocg->hweight_after_donation, iocg->hweight_donating - 1); if (WARN_ON_ONCE(iocg->hweight_active <= 1 || iocg->hweight_donating <= 1 || iocg->hweight_after_donation == 0)) { pr_warn("iocg: invalid donation weights in "); pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); pr_cont(": active=%u donating=%u after=%u\n", iocg->hweight_active, iocg->hweight_donating, iocg->hweight_after_donation); } } /* * Calculate the global donation rate (gamma) - the rate to adjust * non-donating budgets by. * * No need to use 64bit multiplication here as the first operand is * guaranteed to be smaller than WEIGHT_ONE (1<<16). * * We know that there are beneficiary nodes and the sum of the donating * hweights can't be whole; however, due to the round-ups during hweight * calculations, root_iocg->hweight_donating might still end up equal to * or greater than whole. Limit the range when calculating the divider. * * gamma = (1 - t_r') / (1 - t_r) */ gamma = DIV_ROUND_UP( (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); /* * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner * nodes. */ list_for_each_entry(iocg, &inner_walk, walk_list) { struct ioc_gq *parent; u32 inuse, wpt, wptp; u64 st, sf; if (iocg->level == 0) { /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */ iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), WEIGHT_ONE - iocg->hweight_after_donation); continue; } parent = iocg->ancestors[iocg->level - 1]; /* b' = gamma * b_f + b_t' */ iocg->hweight_inuse = DIV64_U64_ROUND_UP( (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), WEIGHT_ONE) + iocg->hweight_after_donation; /* w' = s' * b' / b'_p */ inuse = DIV64_U64_ROUND_UP( (u64)parent->child_adjusted_sum * iocg->hweight_inuse, parent->hweight_inuse); /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */ st = DIV64_U64_ROUND_UP( iocg->child_active_sum * iocg->hweight_donating, iocg->hweight_active); sf = iocg->child_active_sum - st; wpt = DIV64_U64_ROUND_UP( (u64)iocg->active * iocg->hweight_donating, iocg->hweight_active); wptp = DIV64_U64_ROUND_UP( (u64)inuse * iocg->hweight_after_donation, iocg->hweight_inuse); iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); } /* * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and * we can finally determine leaf adjustments. */ list_for_each_entry(iocg, surpluses, surplus_list) { struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; u32 inuse; /* * In-debt iocgs participated in the donation calculation with * the minimum target hweight_inuse. Configuring inuse * accordingly would work fine but debt handling expects * @iocg->inuse stay at the minimum and we don't wanna * interfere. */ if (iocg->abs_vdebt) { WARN_ON_ONCE(iocg->inuse > 1); continue; } /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ inuse = DIV64_U64_ROUND_UP( parent->child_adjusted_sum * iocg->hweight_after_donation, parent->hweight_inuse); TRACE_IOCG_PATH(inuse_transfer, iocg, now, iocg->inuse, inuse, iocg->hweight_inuse, iocg->hweight_after_donation); __propagate_weights(iocg, iocg->active, inuse, true, now); } /* walk list should be dissolved after use */ list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list) list_del_init(&iocg->walk_list); } /* * A low weight iocg can amass a large amount of debt, for example, when * anonymous memory gets reclaimed aggressively. If the system has a lot of * memory paired with a slow IO device, the debt can span multiple seconds or * more. If there are no other subsequent IO issuers, the in-debt iocg may end * up blocked paying its debt while the IO device is idle. * * The following protects against such cases. If the device has been * sufficiently idle for a while, the debts are halved and delays are * recalculated. */ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, struct ioc_now *now) { struct ioc_gq *iocg; u64 dur, usage_pct, nr_cycles, nr_cycles_shift; /* if no debtor, reset the cycle */ if (!nr_debtors) { ioc->dfgv_period_at = now->now; ioc->dfgv_period_rem = 0; ioc->dfgv_usage_us_sum = 0; return; } /* * Debtors can pass through a lot of writes choking the device and we * don't want to be forgiving debts while the device is struggling from * write bursts. If we're missing latency targets, consider the device * fully utilized. */ if (ioc->busy_level > 0) usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); ioc->dfgv_usage_us_sum += usage_us_sum; if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) return; /* * At least DFGV_PERIOD has passed since the last period. Calculate the * average usage and reset the period counters. */ dur = now->now - ioc->dfgv_period_at; usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); ioc->dfgv_period_at = now->now; ioc->dfgv_usage_us_sum = 0; /* if was too busy, reset everything */ if (usage_pct > DFGV_USAGE_PCT) { ioc->dfgv_period_rem = 0; return; } /* * Usage is lower than threshold. Let's forgive some debts. Debt * forgiveness runs off of the usual ioc timer but its period usually * doesn't match ioc's. Compensate the difference by performing the * reduction as many times as would fit in the duration since the last * run and carrying over the left-over duration in @ioc->dfgv_period_rem * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive * reductions is doubled. */ nr_cycles = dur + ioc->dfgv_period_rem; ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { u64 __maybe_unused old_debt, __maybe_unused old_delay; if (!iocg->abs_vdebt && !iocg->delay) continue; spin_lock(&iocg->waitq.lock); old_debt = iocg->abs_vdebt; old_delay = iocg->delay; nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1); if (iocg->abs_vdebt) iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1; if (iocg->delay) iocg->delay = iocg->delay >> nr_cycles_shift ?: 1; iocg_kick_waitq(iocg, true, now); TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct, old_debt, iocg->abs_vdebt, old_delay, iocg->delay); spin_unlock(&iocg->waitq.lock); } } /* * Check the active iocgs' state to avoid oversleeping and deactive * idle iocgs. * * Since waiters determine the sleep durations based on the vrate * they saw at the time of sleep, if vrate has increased, some * waiters could be sleeping for too long. Wake up tardy waiters * which should have woken up in the last period and expire idle * iocgs. */ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) { int nr_debtors = 0; struct ioc_gq *iocg, *tiocg; list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg->delay && !iocg_is_idle(iocg)) continue; spin_lock(&iocg->waitq.lock); /* flush wait and indebt stat deltas */ if (iocg->wait_since) { iocg->stat.wait_us += now->now - iocg->wait_since; iocg->wait_since = now->now; } if (iocg->indebt_since) { iocg->stat.indebt_us += now->now - iocg->indebt_since; iocg->indebt_since = now->now; } if (iocg->indelay_since) { iocg->stat.indelay_us += now->now - iocg->indelay_since; iocg->indelay_since = now->now; } if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, true, now); if (iocg->abs_vdebt || iocg->delay) nr_debtors++; } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ u64 vtime = atomic64_read(&iocg->vtime); s64 excess; /* * @iocg has been inactive for a full duration and will * have a high budget. Account anything above target as * error and throw away. On reactivation, it'll start * with the target budget. */ excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { u32 old_hwi; current_hweight(iocg, NULL, &old_hwi); ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); } TRACE_IOCG_PATH(iocg_idle, iocg, now, atomic64_read(&iocg->active_period), atomic64_read(&ioc->cur_period), vtime); __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } commit_weights(ioc); return nr_debtors; } static void ioc_timer_fn(struct timer_list *timer) { struct ioc *ioc = container_of(timer, struct ioc, timer); struct ioc_gq *iocg, *tiocg; struct ioc_now now; LIST_HEAD(surpluses); int nr_debtors, nr_shortages = 0, nr_lagging = 0; u64 usage_us_sum = 0; u32 ppm_rthr; u32 ppm_wthr; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; int prev_busy_level; /* how were the latencies during the period? */ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); /* take care of active iocgs */ spin_lock_irq(&ioc->lock); ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; ioc_now(ioc, &now); period_vtime = now.vnow - ioc->period_at_vtime; if (WARN_ON_ONCE(!period_vtime)) { spin_unlock_irq(&ioc->lock); return; } nr_debtors = ioc_check_iocgs(ioc, &now); /* * Wait and indebt stat are flushed above and the donation calculation * below needs updated usage stat. Let's bring stat up-to-date. */ iocg_flush_stat(&ioc->active_iocgs, &now); /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { u64 vdone, vtime, usage_us; u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent * iocgs from accumulating a large amount of budget. */ vdone = atomic64_read(&iocg->done_vtime); vtime = atomic64_read(&iocg->vtime); current_hweight(iocg, &hw_active, &hw_inuse); /* * Latency QoS detection doesn't account for IOs which are * in-flight for longer than a period. Detect them by * comparing vdone against period start. If lagging behind * IOs from past periods, don't increase vrate. */ if ((ppm_rthr != MILLION || ppm_wthr != MILLION) && !atomic_read(&iocg_to_blkg(iocg)->use_delay) && time_after64(vtime, vdone) && time_after64(vtime, now.vnow - MAX_LAGGING_PERIODS * period_vtime) && time_before64(vdone, now.vnow - period_vtime)) nr_lagging++; /* * Determine absolute usage factoring in in-flight IOs to avoid * high-latency completions appearing as idle. */ usage_us = iocg->usage_delta_us; usage_us_sum += usage_us; /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { u32 hwa, old_hwi, hwm, new_hwi, usage; u64 usage_dur; if (vdone != vtime) { u64 inflight_us = DIV64_U64_ROUND_UP( cost_to_abs_cost(vtime - vdone, hw_inuse), ioc->vtime_base_rate); usage_us = max(usage_us, inflight_us); } /* convert to hweight based usage ratio */ if (time_after64(iocg->activated_at, ioc->period_at)) usage_dur = max_t(u64, now.now - iocg->activated_at, 1); else usage_dur = max_t(u64, now.now - ioc->period_at, 1); usage = clamp_t(u32, DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, usage_dur), 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. * Determine the donation amount. */ current_hweight(iocg, &hwa, &old_hwi); hwm = current_hweight_max(iocg); new_hwi = hweight_after_donation(iocg, old_hwi, hwm, usage, &now); /* * Donation calculation assumes hweight_after_donation * to be positive, a condition that a donor w/ hwa < 2 * can't meet. Don't bother with donation if hwa is * below 2. It's not gonna make a meaningful difference * anyway. */ if (new_hwi < hwm && hwa >= 2) { iocg->hweight_donating = hwa; iocg->hweight_after_donation = new_hwi; list_add(&iocg->surplus_list, &surpluses); } else if (!iocg->abs_vdebt) { /* * @iocg doesn't have enough to donate. Reset * its inuse to active. * * Don't reset debtors as their inuse's are * owned by debt handling. This shouldn't affect * donation calculuation in any meaningful way * as @iocg doesn't have a meaningful amount of * share anyway. */ TRACE_IOCG_PATH(inuse_shortage, iocg, &now, iocg->inuse, iocg->active, iocg->hweight_inuse, new_hwi); __propagate_weights(iocg, iocg->active, iocg->active, true, &now); nr_shortages++; } } else { /* genuinely short on vtime */ nr_shortages++; } } if (!list_empty(&surpluses) && nr_shortages) transfer_surpluses(&surpluses, &now); commit_weights(ioc); /* surplus list should be dissolved after use */ list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) list_del_init(&iocg->surplus_list); /* * If q is getting clogged or we're missing too much, we're issuing * too much IO and should lower vtime rate. If we're not missing * and experiencing shortages but not surpluses, we're too stingy * and should increase vtime rate. */ prev_busy_level = ioc->busy_level; if (rq_wait_pct > RQ_WAIT_BUSY_PCT || missed_ppm[READ] > ppm_rthr || missed_ppm[WRITE] > ppm_wthr) { /* clearly missing QoS targets, slow down vrate */ ioc->busy_level = max(ioc->busy_level, 0); ioc->busy_level++; } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 && missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) { /* QoS targets are being met with >25% margin */ if (nr_shortages) { /* * We're throttling while the device has spare * capacity. If vrate was being slowed down, stop. */ ioc->busy_level = min(ioc->busy_level, 0); /* * If there are IOs spanning multiple periods, wait * them out before pushing the device harder. */ if (!nr_lagging) ioc->busy_level--; } else { /* * Nobody is being throttled and the users aren't * issuing enough IOs to saturate the device. We * simply don't know how close the device is to * saturation. Coast. */ ioc->busy_level = 0; } } else { /* inside the hysterisis margin, we're good */ ioc->busy_level = 0; } ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now); /* * This period is done. Move onto the next one. If nothing's * going on with the device, stop the timer. */ atomic64_inc(&ioc->cur_period); if (ioc->running != IOC_STOP) { if (!list_empty(&ioc->active_iocgs)) { ioc_start_period(ioc, &now); } else { ioc->busy_level = 0; ioc->vtime_err = 0; ioc->running = IOC_IDLE; } ioc_refresh_vrate(ioc, &now); } spin_unlock_irq(&ioc->lock); } static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, u64 abs_cost, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct ioc_margins *margins = &ioc->margins; u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; u32 hwi, adj_step; s64 margin; u64 cost, new_inuse; unsigned long flags; current_hweight(iocg, NULL, &hwi); old_hwi = hwi; cost = abs_cost_to_cost(abs_cost, hwi); margin = now->vnow - vtime - cost; /* debt handling owns inuse for debtors */ if (iocg->abs_vdebt) return cost; /* * We only increase inuse during period and do so if the margin has * deteriorated since the previous adjustment. */ if (margin >= iocg->saved_margin || margin >= margins->low || iocg->inuse == iocg->active) return cost; spin_lock_irqsave(&ioc->lock, flags); /* we own inuse only when @iocg is in the normal active state */ if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { spin_unlock_irqrestore(&ioc->lock, flags); return cost; } /* * Bump up inuse till @abs_cost fits in the existing budget. * adj_step must be determined after acquiring ioc->lock - we might * have raced and lost to another thread for activation and could * be reading 0 iocg->active before ioc->lock which will lead to * infinite loop. */ new_inuse = iocg->inuse; adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); do { new_inuse = new_inuse + adj_step; propagate_weights(iocg, iocg->active, new_inuse, true, now); current_hweight(iocg, NULL, &hwi); cost = abs_cost_to_cost(abs_cost, hwi); } while (time_after64(vtime + cost, now->vnow) && iocg->inuse != iocg->active); spin_unlock_irqrestore(&ioc->lock, flags); TRACE_IOCG_PATH(inuse_adjust, iocg, now, old_inuse, iocg->inuse, old_hwi, hwi); return cost; } static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, bool is_merge, u64 *costp) { struct ioc *ioc = iocg->ioc; u64 coef_seqio, coef_randio, coef_page; u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1); u64 seek_pages = 0; u64 cost = 0; /* Can't calculate cost for empty bio */ if (!bio->bi_iter.bi_size) goto out; switch (bio_op(bio)) { case REQ_OP_READ: coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; break; case REQ_OP_WRITE: coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; break; default: goto out; } if (iocg->cursor) { seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); seek_pages >>= IOC_SECT_TO_PAGE_SHIFT; } if (!is_merge) { if (seek_pages > LCOEF_RANDIO_PAGES) { cost += coef_randio; } else { cost += coef_seqio; } } cost += pages * coef_page; out: *costp = cost; } static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge) { u64 cost; calc_vtime_cost_builtin(bio, iocg, is_merge, &cost); return cost; } static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc, u64 *costp) { unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT; switch (req_op(rq)) { case REQ_OP_READ: *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; break; case REQ_OP_WRITE: *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; break; default: *costp = 0; } } static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc) { u64 cost; calc_size_vtime_cost_builtin(rq, ioc, &cost); return cost; } static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; struct ioc *ioc = rqos_to_ioc(rqos); struct ioc_gq *iocg = blkg_to_iocg(blkg); struct ioc_now now; struct iocg_wait wait; u64 abs_cost, cost, vtime; bool use_debt, ioc_locked; unsigned long flags; /* bypass IOs if disabled, still initializing, or for root cgroup */ if (!ioc->enabled || !iocg || !iocg->level) return; /* calculate the absolute vtime cost */ abs_cost = calc_vtime_cost(bio, iocg, false); if (!abs_cost) return; if (!iocg_activate(iocg, &now)) return; iocg->cursor = bio_end_sector(bio); vtime = atomic64_read(&iocg->vtime); cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* * If no one's waiting and within budget, issue right away. The * tests are racy but the races aren't systemic - we only miss once * in a while which is fine. */ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && time_before_eq64(vtime + cost, now.vnow)) { iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * We're over budget. This can be handled in two ways. IOs which may * cause priority inversions are punted to @ioc->aux_iocg and charged as * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine * whether debt handling is needed and acquire locks accordingly. */ use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); retry_lock: iocg_lock(iocg, ioc_locked, &flags); /* * @iocg must stay activated for debt and waitq handling. Deactivation * is synchronized against both ioc->lock and waitq.lock and we won't * get deactivated as long as we're waiting or has debt, so we're good * if we're activated here. In the unlikely cases that we aren't, just * issue the IO. */ if (unlikely(list_empty(&iocg->active_list))) { iocg_unlock(iocg, ioc_locked, &flags); iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * We're over budget. If @bio has to be issued regardless, remember * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay * off the debt before waking more IOs. * * This way, the debt is continuously paid off each period with the * actual budget available to the cgroup. If we just wound vtime, we * would incorrectly use the current hw_inuse for the entire amount * which, for example, can lead to the cgroup staying blocked for a * long time even with substantially raised hw_inuse. * * An iocg with vdebt should stay online so that the timer can keep * deducting its vdebt and [de]activate use_delay mechanism * accordingly. We don't want to race against the timer trying to * clear them and leave @iocg inactive w/ dangling use_delay heavily * penalizing the cgroup and its descendants. */ if (use_debt) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); iocg_unlock(iocg, ioc_locked, &flags); return; } /* guarantee that iocgs w/ waiters have maximum inuse */ if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { if (!ioc_locked) { iocg_unlock(iocg, false, &flags); ioc_locked = true; goto retry_lock; } propagate_weights(iocg, iocg->active, iocg->active, true, &now); } /* * Append self to the waitq and schedule the wakeup timer if we're * the first waiter. The timer duration is calculated based on the * current vrate. vtime and hweight changes can make it too short * or too long. Each wait entry records the absolute cost it's * waiting for to allow re-evaluation using a custom wait entry. * * If too short, the timer simply reschedules itself. If too long, * the period timer will notice and trigger wakeups. * * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. */ init_wait_func(&wait.wait, iocg_wake_fn); wait.bio = bio; wait.abs_cost = abs_cost; wait.committed = false; /* will be set true by waker */ __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); iocg_kick_waitq(iocg, ioc_locked, &now); iocg_unlock(iocg, ioc_locked, &flags); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); if (wait.committed) break; io_schedule(); } /* waker already committed us, proceed */ finish_wait(&iocg->waitq, &wait.wait); } static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); struct ioc *ioc = rqos_to_ioc(rqos); sector_t bio_end = bio_end_sector(bio); struct ioc_now now; u64 vtime, abs_cost, cost; unsigned long flags; /* bypass if disabled, still initializing, or for root cgroup */ if (!ioc->enabled || !iocg || !iocg->level) return; abs_cost = calc_vtime_cost(bio, iocg, true); if (!abs_cost) return; ioc_now(ioc, &now); vtime = atomic64_read(&iocg->vtime); cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* update cursor if backmerging into the request at the cursor */ if (blk_rq_pos(rq) < bio_end && blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) iocg->cursor = bio_end; /* * Charge if there's enough vtime budget and the existing request has * cost assigned. */ if (rq->bio && rq->bio->bi_iocost_cost && time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* * Otherwise, account it as debt if @iocg is online, which it should * be for the vast majority of cases. See debt handling in * ioc_rqos_throttle() for details. */ spin_lock_irqsave(&ioc->lock, flags); spin_lock(&iocg->waitq.lock); if (likely(!list_empty(&iocg->active_list))) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); } spin_unlock(&iocg->waitq.lock); spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); if (iocg && bio->bi_iocost_cost) atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); } static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) { struct ioc *ioc = rqos_to_ioc(rqos); struct ioc_pcpu_stat *ccs; u64 on_q_ns, rq_wait_ns, size_nsec; int pidx, rw; if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) return; switch (req_op(rq)) { case REQ_OP_READ: pidx = QOS_RLAT; rw = READ; break; case REQ_OP_WRITE: pidx = QOS_WLAT; rw = WRITE; break; default: return; } on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); ccs = get_cpu_ptr(ioc->pcpu_stat); if (on_q_ns <= size_nsec || on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) local_inc(&ccs->missed[rw].nr_met); else local_inc(&ccs->missed[rw].nr_missed); local64_add(rq_wait_ns, &ccs->rq_wait_ns); put_cpu_ptr(ccs); } static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); spin_lock_irq(&ioc->lock); ioc_refresh_params(ioc, false); spin_unlock_irq(&ioc->lock); } static void ioc_rqos_exit(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost); spin_lock_irq(&ioc->lock); ioc->running = IOC_STOP; spin_unlock_irq(&ioc->lock); timer_shutdown_sync(&ioc->timer); free_percpu(ioc->pcpu_stat); kfree(ioc); } static const struct rq_qos_ops ioc_rqos_ops = { .throttle = ioc_rqos_throttle, .merge = ioc_rqos_merge, .done_bio = ioc_rqos_done_bio, .done = ioc_rqos_done, .queue_depth_changed = ioc_rqos_queue_depth_changed, .exit = ioc_rqos_exit, }; static int blk_iocost_init(struct gendisk *disk) { struct ioc *ioc; int i, cpu, ret; ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); if (!ioc) return -ENOMEM; ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); if (!ioc->pcpu_stat) { kfree(ioc); return -ENOMEM; } for_each_possible_cpu(cpu) { struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { local_set(&ccs->missed[i].nr_met, 0); local_set(&ccs->missed[i].nr_missed, 0); } local64_set(&ccs->rq_wait_ns, 0); } spin_lock_init(&ioc->lock); timer_setup(&ioc->timer, ioc_timer_fn, 0); INIT_LIST_HEAD(&ioc->active_iocgs); ioc->running = IOC_IDLE; ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); ioc->period_at = ktime_to_us(blk_time_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0); spin_lock_irq(&ioc->lock); ioc->autop_idx = AUTOP_INVALID; ioc_refresh_params_disk(ioc, true, disk); spin_unlock_irq(&ioc->lock); /* * rqos must be added before activation to allow ioc_pd_init() to * lookup the ioc from q. This means that the rqos methods may get * called before policy activation completion, can't assume that the * target bio has an iocg associated and need to test for NULL iocg. */ ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops); if (ret) goto err_free_ioc; ret = blkcg_activate_policy(disk, &blkcg_policy_iocost); if (ret) goto err_del_qos; return 0; err_del_qos: rq_qos_del(&ioc->rqos); err_free_ioc: free_percpu(ioc->pcpu_stat); kfree(ioc); return ret; } static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) { struct ioc_cgrp *iocc; iocc = kzalloc(sizeof(struct ioc_cgrp), gfp); if (!iocc) return NULL; iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; return &iocc->cpd; } static void ioc_cpd_free(struct blkcg_policy_data *cpd) { kfree(container_of(cpd, struct ioc_cgrp, cpd)); } static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, disk->node_id); if (!iocg) return NULL; iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); if (!iocg->pcpu_stat) { kfree(iocg); return NULL; } return &iocg->pd; } static void ioc_pd_init(struct blkg_policy_data *pd) { struct ioc_gq *iocg = pd_to_iocg(pd); struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); struct ioc *ioc = q_to_ioc(blkg->q); struct ioc_now now; struct blkcg_gq *tblkg; unsigned long flags; ioc_now(ioc, &now); iocg->ioc = ioc; atomic64_set(&iocg->vtime, now.vnow); atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); INIT_LIST_HEAD(&iocg->walk_list); INIT_LIST_HEAD(&iocg->surplus_list); iocg->hweight_active = WEIGHT_ONE; iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); hrtimer_setup(&iocg->waitq_timer, iocg_waitq_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->level = blkg->blkcg->css.cgroup->level; for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { struct ioc_gq *tiocg = blkg_to_iocg(tblkg); iocg->ancestors[tiocg->level] = tiocg; } spin_lock_irqsave(&ioc->lock, flags); weight_updated(iocg, &now); spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_pd_free(struct blkg_policy_data *pd) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; unsigned long flags; if (ioc) { spin_lock_irqsave(&ioc->lock, flags); if (!list_empty(&iocg->active_list)) { struct ioc_now now; ioc_now(ioc, &now); propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } WARN_ON_ONCE(!list_empty(&iocg->walk_list)); WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); spin_unlock_irqrestore(&ioc->lock, flags); hrtimer_cancel(&iocg->waitq_timer); } free_percpu(iocg->pcpu_stat); kfree(iocg); } static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; if (!ioc->enabled) return; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( ioc->vtime_base_rate * 10000, VTIME_PER_USEC); seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); } seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); if (blkcg_debug_stats) seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", iocg->last_stat.wait_us, iocg->last_stat.indebt_us, iocg->last_stat.indelay_us); } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc_gq *iocg = pd_to_iocg(pd); if (dname && iocg->cfg_weight) seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); return 0; } static int ioc_weight_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); struct blkg_conf_ctx ctx; struct ioc_now now; struct ioc_gq *iocg; u32 v; int ret; if (!strchr(buf, ':')) { struct blkcg_gq *blkg; if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v)) return -EINVAL; if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) return -EINVAL; spin_lock_irq(&blkcg->lock); iocc->dfl_weight = v * WEIGHT_ONE; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct ioc_gq *iocg = blkg_to_iocg(blkg); if (iocg) { spin_lock(&iocg->ioc->lock); ioc_now(iocg->ioc, &now); weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); } } spin_unlock_irq(&blkcg->lock); return nbytes; } blkg_conf_init(&ctx, buf); ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); if (ret) goto err; iocg = blkg_to_iocg(ctx.blkg); if (!strncmp(ctx.body, "default", 7)) { v = 0; } else { if (!sscanf(ctx.body, "%u", &v)) goto einval; if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) goto einval; } spin_lock(&iocg->ioc->lock); iocg->cfg_weight = v * WEIGHT_ONE; ioc_now(iocg->ioc, &now); weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); blkg_conf_exit(&ctx); return nbytes; einval: ret = -EINVAL; err: blkg_conf_exit(&ctx); return ret; } static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc *ioc = pd_to_iocg(pd)->ioc; if (!dname) return 0; spin_lock(&ioc->lock); seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", ioc->params.qos[QOS_RPPM] / 10000, ioc->params.qos[QOS_RPPM] % 10000 / 100, ioc->params.qos[QOS_RLAT], ioc->params.qos[QOS_WPPM] / 10000, ioc->params.qos[QOS_WPPM] % 10000 / 100, ioc->params.qos[QOS_WLAT], ioc->params.qos[QOS_MIN] / 10000, ioc->params.qos[QOS_MIN] % 10000 / 100, ioc->params.qos[QOS_MAX] / 10000, ioc->params.qos[QOS_MAX] % 10000 / 100); spin_unlock(&ioc->lock); return 0; } static int ioc_qos_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static const match_table_t qos_ctrl_tokens = { { QOS_ENABLE, "enable=%u" }, { QOS_CTRL, "ctrl=%s" }, { NR_QOS_CTRL_PARAMS, NULL }, }; static const match_table_t qos_tokens = { { QOS_RPPM, "rpct=%s" }, { QOS_RLAT, "rlat=%u" }, { QOS_WPPM, "wpct=%s" }, { QOS_WLAT, "wlat=%u" }, { QOS_MIN, "min=%s" }, { QOS_MAX, "max=%s" }, { NR_QOS_PARAMS, NULL }, }; static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct blkg_conf_ctx ctx; struct gendisk *disk; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; char *body, *p; unsigned long memflags; int ret; blkg_conf_init(&ctx, input); memflags = blkg_conf_open_bdev_frozen(&ctx); if (IS_ERR_VALUE(memflags)) { ret = memflags; goto err; } body = ctx.body; disk = ctx.bdev->bd_disk; if (!queue_is_mq(disk->queue)) { ret = -EOPNOTSUPP; goto err; } ioc = q_to_ioc(disk->queue); if (!ioc) { ret = blk_iocost_init(disk); if (ret) goto err; ioc = q_to_ioc(disk->queue); } blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); memcpy(qos, ioc->params.qos, sizeof(qos)); enable = ioc->enabled; user = ioc->user_qos_params; while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; s64 v; if (!*p) continue; switch (match_token(p, qos_ctrl_tokens, args)) { case QOS_ENABLE: if (match_u64(&args[0], &v)) goto einval; enable = v; continue; case QOS_CTRL: match_strlcpy(buf, &args[0], sizeof(buf)); if (!strcmp(buf, "auto")) user = false; else if (!strcmp(buf, "user")) user = true; else goto einval; continue; } tok = match_token(p, qos_tokens, args); switch (tok) { case QOS_RPPM: case QOS_WPPM: if (match_strlcpy(buf, &args[0], sizeof(buf)) >= sizeof(buf)) goto einval; if (cgroup_parse_float(buf, 2, &v)) goto einval; if (v < 0 || v > 10000) goto einval; qos[tok] = v * 100; break; case QOS_RLAT: case QOS_WLAT: if (match_u64(&args[0], &v)) goto einval; qos[tok] = v; break; case QOS_MIN: case QOS_MAX: if (match_strlcpy(buf, &args[0], sizeof(buf)) >= sizeof(buf)) goto einval; if (cgroup_parse_float(buf, 2, &v)) goto einval; if (v < 0) goto einval; qos[tok] = clamp_t(s64, v * 100, VRATE_MIN_PPM, VRATE_MAX_PPM); break; default: goto einval; } user = true; } if (qos[QOS_MIN] > qos[QOS_MAX]) goto einval; if (enable && !ioc->enabled) { blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; } else if (!enable && ioc->enabled) { blk_stat_disable_accounting(disk->queue); blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; } if (user) { memcpy(ioc->params.qos, qos, sizeof(qos)); ioc->user_qos_params = true; } else { ioc->user_qos_params = false; } ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); if (enable) wbt_disable_default(disk); else wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); blkg_conf_exit_frozen(&ctx, memflags); return nbytes; einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(disk->queue); ret = -EINVAL; err: blkg_conf_exit_frozen(&ctx, memflags); return ret; } static u64 ioc_cost_model_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { const char *dname = blkg_dev_name(pd->blkg); struct ioc *ioc = pd_to_iocg(pd)->ioc; u64 *u = ioc->params.i_lcoefs; if (!dname) return 0; spin_lock(&ioc->lock); seq_printf(sf, "%s ctrl=%s model=linear " "rbps=%llu rseqiops=%llu rrandiops=%llu " "wbps=%llu wseqiops=%llu wrandiops=%llu\n", dname, ioc->user_cost_model ? "user" : "auto", u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); spin_unlock(&ioc->lock); return 0; } static int ioc_cost_model_show(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; } static const match_table_t cost_ctrl_tokens = { { COST_CTRL, "ctrl=%s" }, { COST_MODEL, "model=%s" }, { NR_COST_CTRL_PARAMS, NULL }, }; static const match_table_t i_lcoef_tokens = { { I_LCOEF_RBPS, "rbps=%u" }, { I_LCOEF_RSEQIOPS, "rseqiops=%u" }, { I_LCOEF_RRANDIOPS, "rrandiops=%u" }, { I_LCOEF_WBPS, "wbps=%u" }, { I_LCOEF_WSEQIOPS, "wseqiops=%u" }, { I_LCOEF_WRANDIOPS, "wrandiops=%u" }, { NR_I_LCOEFS, NULL }, }; static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct blkg_conf_ctx ctx; struct request_queue *q; unsigned int memflags; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; char *body, *p; int ret; blkg_conf_init(&ctx, input); ret = blkg_conf_open_bdev(&ctx); if (ret) goto err; body = ctx.body; q = bdev_get_queue(ctx.bdev); if (!queue_is_mq(q)) { ret = -EOPNOTSUPP; goto err; } ioc = q_to_ioc(q); if (!ioc) { ret = blk_iocost_init(ctx.bdev->bd_disk); if (ret) goto err; ioc = q_to_ioc(q); } memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); spin_lock_irq(&ioc->lock); memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; u64 v; if (!*p) continue; switch (match_token(p, cost_ctrl_tokens, args)) { case COST_CTRL: match_strlcpy(buf, &args[0], sizeof(buf)); if (!strcmp(buf, "auto")) user = false; else if (!strcmp(buf, "user")) user = true; else goto einval; continue; case COST_MODEL: match_strlcpy(buf, &args[0], sizeof(buf)); if (strcmp(buf, "linear")) goto einval; continue; } tok = match_token(p, i_lcoef_tokens, args); if (tok == NR_I_LCOEFS) goto einval; if (match_u64(&args[0], &v)) goto einval; u[tok] = v; user = true; } if (user) { memcpy(ioc->params.i_lcoefs, u, sizeof(u)); ioc->user_cost_model = true; } else { ioc->user_cost_model = false; } ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q, memflags); blkg_conf_exit(&ctx); return nbytes; einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q, memflags); ret = -EINVAL; err: blkg_conf_exit(&ctx); return ret; } static struct cftype ioc_files[] = { { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = ioc_weight_show, .write = ioc_weight_write, }, { .name = "cost.qos", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = ioc_qos_show, .write = ioc_qos_write, }, { .name = "cost.model", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = ioc_cost_model_show, .write = ioc_cost_model_write, }, {} }; static struct blkcg_policy blkcg_policy_iocost = { .dfl_cftypes = ioc_files, .cpd_alloc_fn = ioc_cpd_alloc, .cpd_free_fn = ioc_cpd_free, .pd_alloc_fn = ioc_pd_alloc, .pd_init_fn = ioc_pd_init, .pd_free_fn = ioc_pd_free, .pd_stat_fn = ioc_pd_stat, }; static int __init ioc_init(void) { return blkcg_policy_register(&blkcg_policy_iocost); } static void __exit ioc_exit(void) { blkcg_policy_unregister(&blkcg_policy_iocost); } module_init(ioc_init); module_exit(ioc_exit);
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * TEA, XTEA, and XETA crypto alogrithms * * The TEA and Xtended TEA algorithms were developed by David Wheeler * and Roger Needham at the Computer Laboratory of Cambridge University. * * Due to the order of evaluation in XTEA many people have incorrectly * implemented it. XETA (XTEA in the wrong order), exists for * compatibility with these implementations. * * Copyright (c) 2004 Aaron Grothe ajgrothe@yahoo.com */ #include <crypto/algapi.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/unaligned.h> #include <linux/types.h> #define TEA_KEY_SIZE 16 #define TEA_BLOCK_SIZE 8 #define TEA_ROUNDS 32 #define TEA_DELTA 0x9e3779b9 #define XTEA_KEY_SIZE 16 #define XTEA_BLOCK_SIZE 8 #define XTEA_ROUNDS 32 #define XTEA_DELTA 0x9e3779b9 struct tea_ctx { u32 KEY[4]; }; struct xtea_ctx { u32 KEY[4]; }; static int tea_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct tea_ctx *ctx = crypto_tfm_ctx(tfm); ctx->KEY[0] = get_unaligned_le32(&in_key[0]); ctx->KEY[1] = get_unaligned_le32(&in_key[4]); ctx->KEY[2] = get_unaligned_le32(&in_key[8]); ctx->KEY[3] = get_unaligned_le32(&in_key[12]); return 0; } static void tea_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, n, sum = 0; u32 k0, k1, k2, k3; struct tea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); k0 = ctx->KEY[0]; k1 = ctx->KEY[1]; k2 = ctx->KEY[2]; k3 = ctx->KEY[3]; n = TEA_ROUNDS; while (n-- > 0) { sum += TEA_DELTA; y += ((z << 4) + k0) ^ (z + sum) ^ ((z >> 5) + k1); z += ((y << 4) + k2) ^ (y + sum) ^ ((y >> 5) + k3); } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static void tea_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, n, sum; u32 k0, k1, k2, k3; struct tea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); k0 = ctx->KEY[0]; k1 = ctx->KEY[1]; k2 = ctx->KEY[2]; k3 = ctx->KEY[3]; sum = TEA_DELTA << 5; n = TEA_ROUNDS; while (n-- > 0) { z -= ((y << 4) + k2) ^ (y + sum) ^ ((y >> 5) + k3); y -= ((z << 4) + k0) ^ (z + sum) ^ ((z >> 5) + k1); sum -= TEA_DELTA; } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static int xtea_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct xtea_ctx *ctx = crypto_tfm_ctx(tfm); ctx->KEY[0] = get_unaligned_le32(&in_key[0]); ctx->KEY[1] = get_unaligned_le32(&in_key[4]); ctx->KEY[2] = get_unaligned_le32(&in_key[8]); ctx->KEY[3] = get_unaligned_le32(&in_key[12]); return 0; } static void xtea_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, sum = 0; u32 limit = XTEA_DELTA * XTEA_ROUNDS; struct xtea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); while (sum != limit) { y += ((z << 4 ^ z >> 5) + z) ^ (sum + ctx->KEY[sum&3]); sum += XTEA_DELTA; z += ((y << 4 ^ y >> 5) + y) ^ (sum + ctx->KEY[sum>>11 &3]); } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static void xtea_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, sum; struct tea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); sum = XTEA_DELTA * XTEA_ROUNDS; while (sum) { z -= ((y << 4 ^ y >> 5) + y) ^ (sum + ctx->KEY[sum>>11 & 3]); sum -= XTEA_DELTA; y -= ((z << 4 ^ z >> 5) + z) ^ (sum + ctx->KEY[sum & 3]); } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static void xeta_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, sum = 0; u32 limit = XTEA_DELTA * XTEA_ROUNDS; struct xtea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); while (sum != limit) { y += (z << 4 ^ z >> 5) + (z ^ sum) + ctx->KEY[sum&3]; sum += XTEA_DELTA; z += (y << 4 ^ y >> 5) + (y ^ sum) + ctx->KEY[sum>>11 &3]; } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static void xeta_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { u32 y, z, sum; struct tea_ctx *ctx = crypto_tfm_ctx(tfm); y = get_unaligned_le32(&src[0]); z = get_unaligned_le32(&src[4]); sum = XTEA_DELTA * XTEA_ROUNDS; while (sum) { z -= (y << 4 ^ y >> 5) + (y ^ sum) + ctx->KEY[sum>>11 & 3]; sum -= XTEA_DELTA; y -= (z << 4 ^ z >> 5) + (z ^ sum) + ctx->KEY[sum & 3]; } put_unaligned_le32(y, &dst[0]); put_unaligned_le32(z, &dst[4]); } static struct crypto_alg tea_algs[3] = { { .cra_name = "tea", .cra_driver_name = "tea-generic", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = TEA_BLOCK_SIZE, .cra_ctxsize = sizeof (struct tea_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = TEA_KEY_SIZE, .cia_max_keysize = TEA_KEY_SIZE, .cia_setkey = tea_setkey, .cia_encrypt = tea_encrypt, .cia_decrypt = tea_decrypt } } }, { .cra_name = "xtea", .cra_driver_name = "xtea-generic", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = XTEA_BLOCK_SIZE, .cra_ctxsize = sizeof (struct xtea_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = XTEA_KEY_SIZE, .cia_max_keysize = XTEA_KEY_SIZE, .cia_setkey = xtea_setkey, .cia_encrypt = xtea_encrypt, .cia_decrypt = xtea_decrypt } } }, { .cra_name = "xeta", .cra_driver_name = "xeta-generic", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = XTEA_BLOCK_SIZE, .cra_ctxsize = sizeof (struct xtea_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = XTEA_KEY_SIZE, .cia_max_keysize = XTEA_KEY_SIZE, .cia_setkey = xtea_setkey, .cia_encrypt = xeta_encrypt, .cia_decrypt = xeta_decrypt } } } }; static int __init tea_mod_init(void) { return crypto_register_algs(tea_algs, ARRAY_SIZE(tea_algs)); } static void __exit tea_mod_fini(void) { crypto_unregister_algs(tea_algs, ARRAY_SIZE(tea_algs)); } MODULE_ALIAS_CRYPTO("tea"); MODULE_ALIAS_CRYPTO("xtea"); MODULE_ALIAS_CRYPTO("xeta"); module_init(tea_mod_init); module_exit(tea_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TEA, XTEA & XETA Cryptographic Algorithms");
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RECIPROCAL_DIV_H #define _LINUX_RECIPROCAL_DIV_H #include <linux/types.h> /* * This algorithm is based on the paper "Division by Invariant * Integers Using Multiplication" by Torbjörn Granlund and Peter * L. Montgomery. * * The assembler implementation from Agner Fog, which this code is * based on, can be found here: * http://www.agner.org/optimize/asmlib.zip * * This optimization for A/B is helpful if the divisor B is mostly * runtime invariant. The reciprocal of B is calculated in the * slow-path with reciprocal_value(). The fast-path can then just use * a much faster multiplication operation with a variable dividend A * to calculate the division A/B. */ struct reciprocal_value { u32 m; u8 sh1, sh2; }; /* "reciprocal_value" and "reciprocal_divide" together implement the basic * version of the algorithm described in Figure 4.1 of the paper. */ struct reciprocal_value reciprocal_value(u32 d); static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) { u32 t = (u32)(((u64)a * R.m) >> 32); return (t + ((a - t) >> R.sh1)) >> R.sh2; } struct reciprocal_value_adv { u32 m; u8 sh, exp; bool is_wide_m; }; /* "reciprocal_value_adv" implements the advanced version of the algorithm * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The * exception case could be easily handled before calling "reciprocal_value_adv". * * The advanced version requires more complex calculation to get the reciprocal * multiplier and other control variables, but then could reduce the required * emulation operations. * * It makes no sense to use this advanced version for host divide emulation, * those extra complexities for calculating multiplier etc could completely * waive our saving on emulation operations. * * However, it makes sense to use it for JIT divide code generation for which * we are willing to trade performance of JITed code with that of host. As shown * by the following pseudo code, the required emulation operations could go down * from 6 (the basic version) to 3 or 4. * * To use the result of "reciprocal_value_adv", suppose we want to calculate * n/d, the pseudo C code will be: * * struct reciprocal_value_adv rvalue; * u8 pre_shift, exp; * * // handle exception case. * if (d >= (1U << 31)) { * result = n >= d; * return; * } * * rvalue = reciprocal_value_adv(d, 32) * exp = rvalue.exp; * if (rvalue.is_wide_m && !(d & 1)) { * // floor(log2(d & (2^32 -d))) * pre_shift = fls(d & -d) - 1; * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); * } else { * pre_shift = 0; * } * * // code generation starts. * if (imm == 1U << exp) { * result = n >> exp; * } else if (rvalue.is_wide_m) { * // pre_shift must be zero when reached here. * t = (n * rvalue.m) >> 32; * result = n - t; * result >>= 1; * result += t; * result >>= rvalue.sh - 1; * } else { * if (pre_shift) * result = n >> pre_shift; * result = ((u64)result * rvalue.m) >> 32; * result >>= rvalue.sh; * } */ struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec); #endif /* _LINUX_RECIPROCAL_DIV_H */
109 109 109 109 25 107 109 110 110 109 108 110 108 26 109 108 110 109 106 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 // SPDX-License-Identifier: GPL-2.0 #include <linux/static_call.h> #include <linux/memory.h> #include <linux/bug.h> #include <asm/text-patching.h> enum insn_type { CALL = 0, /* site call */ NOP = 1, /* site cond-call */ JMP = 2, /* tramp / site tail-call */ RET = 3, /* tramp / site cond-tail-call */ JCC = 4, }; /* * ud1 %esp, %ecx - a 3 byte #UD that is unique to trampolines, chosen such * that there is no false-positive trampoline identification while also being a * speculation stop. */ static const u8 tramp_ud[] = { 0x0f, 0xb9, 0xcc }; /* * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax */ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 }; static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */ { u8 ret = 0; if (insn[0] == 0x0f) { u8 tmp = insn[1]; if ((tmp & 0xf0) == 0x80) ret = tmp; } return ret; } extern void __static_call_return(void); asm (".global __static_call_return\n\t" ".type __static_call_return, @function\n\t" ASM_FUNC_ALIGN "\n\t" "__static_call_return:\n\t" ANNOTATE_NOENDBR ANNOTATE_RETPOLINE_SAFE "ret; int3\n\t" ".size __static_call_return, . - __static_call_return \n\t"); static void __ref __static_call_transform(void *insn, enum insn_type type, void *func, bool modinit) { const void *emulate = NULL; int size = CALL_INSN_SIZE; const void *code; u8 op, buf[6]; if ((type == JMP || type == RET) && (op = __is_Jcc(insn))) type = JCC; switch (type) { case CALL: func = callthunks_translate_call_dest(func); code = text_gen_insn(CALL_INSN_OPCODE, insn, func); if (func == &__static_call_return0) { emulate = code; code = &xor5rax; } break; case NOP: code = x86_nops[5]; break; case JMP: code = text_gen_insn(JMP32_INSN_OPCODE, insn, func); break; case RET: if (cpu_wants_rethunk_at(insn)) code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); else code = &retinsn; break; case JCC: if (!func) { func = __static_call_return; if (cpu_wants_rethunk()) func = x86_return_thunk; } buf[0] = 0x0f; __text_gen_insn(buf+1, op, insn+1, func, 5); code = buf; size = 6; break; } if (memcmp(insn, code, size) == 0) return; if (system_state == SYSTEM_BOOTING || modinit) return text_poke_early(insn, code, size); smp_text_poke_single(insn, code, size, emulate); } static void __static_call_validate(u8 *insn, bool tail, bool tramp) { u8 opcode = insn[0]; if (tramp && memcmp(insn+5, tramp_ud, 3)) { pr_err("trampoline signature fail"); BUG(); } if (tail) { if (opcode == JMP32_INSN_OPCODE || opcode == RET_INSN_OPCODE || __is_Jcc(insn)) return; } else { if (opcode == CALL_INSN_OPCODE || !memcmp(insn, x86_nops[5], 5) || !memcmp(insn, xor5rax, 5)) return; } /* * If we ever trigger this, our text is corrupt, we'll probably not live long. */ pr_err("unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); BUG(); } static inline enum insn_type __sc_insn(bool null, bool tail) { /* * Encode the following table without branches: * * tail null insn * -----+-------+------ * 0 | 0 | CALL * 0 | 1 | NOP * 1 | 0 | JMP * 1 | 1 | RET */ return 2*tail + null; } void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) { mutex_lock(&text_mutex); if (tramp && !site) { __static_call_validate(tramp, true, true); __static_call_transform(tramp, __sc_insn(!func, true), func, false); } if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { __static_call_validate(site, tail, false); __static_call_transform(site, __sc_insn(!func, tail), func, false); } mutex_unlock(&text_mutex); } EXPORT_SYMBOL_GPL(arch_static_call_transform); noinstr void __static_call_update_early(void *tramp, void *func) { BUG_ON(system_state != SYSTEM_BOOTING); BUG_ON(static_call_initialized); __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE); sync_core(); } #ifdef CONFIG_MITIGATION_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as * having a return trampoline. * * The problem is that static_call() is available before determining * X86_FEATURE_RETHUNK and, by implication, running alternatives. * * This means that __static_call_transform() above can have overwritten the * return trampoline and we now need to fix things up to be consistent. */ bool __static_call_fixup(void *tramp, u8 op, void *dest) { unsigned long addr = (unsigned long)tramp; /* * Not all .return_sites are a static_call trampoline (most are not). * Check if the 3 bytes after the return are still kernel text, if not, * then this definitely is not a trampoline and we need not worry * further. * * This avoids the memcmp() below tripping over pagefaults etc.. */ if (((addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT)) && !kernel_text_address(addr + 7)) return false; if (memcmp(tramp+5, tramp_ud, 3)) { /* Not a trampoline site, not our problem. */ return false; } mutex_lock(&text_mutex); if (op == RET_INSN_OPCODE || dest == &__x86_return_thunk) __static_call_transform(tramp, RET, NULL, true); mutex_unlock(&text_mutex); return true; } #endif
19 2 2 2 2 19 19 19 19 19 1 1 1 19 19 19 2 1 20 20 19 20 3 3 3 1 3 4 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 // SPDX-License-Identifier: GPL-2.0-only /* * BSS client mode implementation * Copyright 2003-2008, Jouni Malinen <j@w1.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2025 Intel Corporation */ #include <linux/delay.h> #include <linux/fips.h> #include <linux/if_ether.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/moduleparam.h> #include <linux/rtnetlink.h> #include <linux/crc32.h> #include <linux/slab.h> #include <linux/export.h> #include <net/mac80211.h> #include <linux/unaligned.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "led.h" #include "fils_aead.h" #include <kunit/static_stub.h> #define IEEE80211_AUTH_TIMEOUT (HZ / 5) #define IEEE80211_AUTH_TIMEOUT_LONG (HZ / 2) #define IEEE80211_AUTH_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_AUTH_TIMEOUT_SAE (HZ * 2) #define IEEE80211_AUTH_MAX_TRIES 3 #define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5) #define IEEE80211_AUTH_WAIT_SAE_RETRY (HZ * 2) #define IEEE80211_ASSOC_TIMEOUT (HZ / 5) #define IEEE80211_ASSOC_TIMEOUT_LONG (HZ / 2) #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_ASSOC_MAX_TRIES 3 #define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS (100 * USEC_PER_MSEC) #define IEEE80211_ADV_TTLM_ST_UNDERFLOW 0xff00 #define IEEE80211_NEG_TTLM_REQ_TIMEOUT (HZ / 5) static int max_nullfunc_tries = 2; module_param(max_nullfunc_tries, int, 0644); MODULE_PARM_DESC(max_nullfunc_tries, "Maximum nullfunc tx tries before disconnecting (reason 4)."); static int max_probe_tries = 5; module_param(max_probe_tries, int, 0644); MODULE_PARM_DESC(max_probe_tries, "Maximum probe tries before disconnecting (reason 4)."); /* * Beacon loss timeout is calculated as N frames times the * advertised beacon interval. This may need to be somewhat * higher than what hardware might detect to account for * delays in the host processing frames. But since we also * probe on beacon miss before declaring the connection lost * default to what we want. */ static int beacon_loss_count = 7; module_param(beacon_loss_count, int, 0644); MODULE_PARM_DESC(beacon_loss_count, "Number of beacon intervals before we decide beacon was lost."); /* * Time the connection can be idle before we probe * it to see if we can still talk to the AP. */ #define IEEE80211_CONNECTION_IDLE_TIME (30 * HZ) /* * Time we wait for a probe response after sending * a probe request because of beacon loss or for * checking the connection still works. */ static int probe_wait_ms = 500; module_param(probe_wait_ms, int, 0644); MODULE_PARM_DESC(probe_wait_ms, "Maximum time(ms) to wait for probe response" " before disconnecting (reason 4)."); /* * How many Beacon frames need to have been used in average signal strength * before starting to indicate signal change events. */ #define IEEE80211_SIGNAL_AVE_MIN_COUNT 4 /* * We can have multiple work items (and connection probing) * scheduling this timer, but we need to take care to only * reschedule it when it should fire _earlier_ than it was * asked for before, or if it's not pending right now. This * function ensures that. Note that it then is required to * run this function for all timeouts after the first one * has happened -- the work that runs from this timer will * do that. */ static void run_again(struct ieee80211_sub_if_data *sdata, unsigned long timeout) { lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!timer_pending(&sdata->u.mgd.timer) || time_before(timeout, sdata->u.mgd.timer.expires)) mod_timer(&sdata->u.mgd.timer, timeout); } void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) return; if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&sdata->u.mgd.bcn_mon_timer, round_jiffies_up(jiffies + sdata->u.mgd.beacon_timeout)); } void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (unlikely(!ifmgd->associated)) return; if (ifmgd->probe_send_count) ifmgd->probe_send_count = 0; if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); } static int ecw2cw(int ecw) { return (1 << ecw) - 1; } static enum ieee80211_conn_mode ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, u32 vht_cap_info, const struct ieee802_11_elems *elems, bool ignore_ht_channel_mismatch, const struct ieee80211_conn_settings *conn, struct cfg80211_chan_def *chandef) { const struct ieee80211_ht_operation *ht_oper = elems->ht_operation; const struct ieee80211_vht_operation *vht_oper = elems->vht_operation; const struct ieee80211_he_operation *he_oper = elems->he_operation; const struct ieee80211_eht_operation *eht_oper = elems->eht_operation; struct ieee80211_supported_band *sband = sdata->local->hw.wiphy->bands[channel->band]; struct cfg80211_chan_def vht_chandef; bool no_vht = false; u32 ht_cfreq; if (ieee80211_hw_check(&sdata->local->hw, STRICT)) ignore_ht_channel_mismatch = false; *chandef = (struct cfg80211_chan_def) { .chan = channel, .width = NL80211_CHAN_WIDTH_20_NOHT, .center_freq1 = channel->center_freq, .freq1_offset = channel->freq_offset, }; /* get special S1G case out of the way */ if (sband->band == NL80211_BAND_S1GHZ) { if (!ieee80211_chandef_s1g_oper(sdata->local, elems->s1g_oper, chandef)) { /* Fallback to default 1MHz */ chandef->width = NL80211_CHAN_WIDTH_1; chandef->s1g_primary_2mhz = false; } return IEEE80211_CONN_MODE_S1G; } /* get special 6 GHz case out of the way */ if (sband->band == NL80211_BAND_6GHZ) { enum ieee80211_conn_mode mode = IEEE80211_CONN_MODE_EHT; /* this is an error */ if (conn->mode < IEEE80211_CONN_MODE_HE) return IEEE80211_CONN_MODE_LEGACY; if (!elems->he_6ghz_capa || !elems->he_cap) { sdata_info(sdata, "HE 6 GHz AP is missing HE/HE 6 GHz band capability\n"); return IEEE80211_CONN_MODE_LEGACY; } if (!eht_oper || !elems->eht_cap) { eht_oper = NULL; mode = IEEE80211_CONN_MODE_HE; } if (!ieee80211_chandef_he_6ghz_oper(sdata->local, he_oper, eht_oper, chandef)) { sdata_info(sdata, "bad HE/EHT 6 GHz operation\n"); return IEEE80211_CONN_MODE_LEGACY; } return mode; } /* now we have the progression HT, VHT, ... */ if (conn->mode < IEEE80211_CONN_MODE_HT) return IEEE80211_CONN_MODE_LEGACY; if (!ht_oper || !elems->ht_cap_elem) return IEEE80211_CONN_MODE_LEGACY; chandef->width = NL80211_CHAN_WIDTH_20; ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, channel->band); /* check that channel matches the right operating channel */ if (!ignore_ht_channel_mismatch && channel->center_freq != ht_cfreq) { /* * It's possible that some APs are confused here; * Netgear WNDR3700 sometimes reports 4 higher than * the actual channel in association responses, but * since we look at probe response/beacon data here * it should be OK. */ sdata_info(sdata, "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n", channel->center_freq, ht_cfreq, ht_oper->primary_chan, channel->band); return IEEE80211_CONN_MODE_LEGACY; } ieee80211_chandef_ht_oper(ht_oper, chandef); if (conn->mode < IEEE80211_CONN_MODE_VHT) return IEEE80211_CONN_MODE_HT; vht_chandef = *chandef; /* * having he_cap/he_oper parsed out implies we're at * least operating as HE STA */ if (elems->he_cap && he_oper && he_oper->he_oper_params & cpu_to_le32(IEEE80211_HE_OPERATION_VHT_OPER_INFO)) { struct ieee80211_vht_operation he_oper_vht_cap; /* * Set only first 3 bytes (other 2 aren't used in * ieee80211_chandef_vht_oper() anyway) */ memcpy(&he_oper_vht_cap, he_oper->optional, 3); he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, &he_oper_vht_cap, ht_oper, &vht_chandef)) { sdata_info(sdata, "HE AP VHT information is invalid, disabling HE\n"); /* this will cause us to re-parse as VHT STA */ return IEEE80211_CONN_MODE_VHT; } } else if (!vht_oper || !elems->vht_cap_elem) { if (sband->band == NL80211_BAND_5GHZ) { sdata_info(sdata, "VHT information is missing, disabling VHT\n"); return IEEE80211_CONN_MODE_HT; } no_vht = true; } else if (sband->band == NL80211_BAND_2GHZ) { no_vht = true; } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, vht_oper, ht_oper, &vht_chandef)) { sdata_info(sdata, "AP VHT information is invalid, disabling VHT\n"); return IEEE80211_CONN_MODE_HT; } if (!cfg80211_chandef_compatible(chandef, &vht_chandef)) { sdata_info(sdata, "AP VHT information doesn't match HT, disabling VHT\n"); return IEEE80211_CONN_MODE_HT; } *chandef = vht_chandef; /* stick to current max mode if we or the AP don't have HE */ if (conn->mode < IEEE80211_CONN_MODE_HE || !elems->he_operation || !elems->he_cap) { if (no_vht) return IEEE80211_CONN_MODE_HT; return IEEE80211_CONN_MODE_VHT; } /* stick to HE if we or the AP don't have EHT */ if (conn->mode < IEEE80211_CONN_MODE_EHT || !eht_oper || !elems->eht_cap) return IEEE80211_CONN_MODE_HE; /* * handle the case that the EHT operation indicates that it holds EHT * operation information (in case that the channel width differs from * the channel width reported in HT/VHT/HE). */ if (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) { struct cfg80211_chan_def eht_chandef = *chandef; ieee80211_chandef_eht_oper((const void *)eht_oper->optional, &eht_chandef); eht_chandef.punctured = ieee80211_eht_oper_dis_subchan_bitmap(eht_oper); if (!cfg80211_chandef_valid(&eht_chandef)) { sdata_info(sdata, "AP EHT information is invalid, disabling EHT\n"); return IEEE80211_CONN_MODE_HE; } if (!cfg80211_chandef_compatible(chandef, &eht_chandef)) { sdata_info(sdata, "AP EHT information doesn't match HT/VHT/HE, disabling EHT\n"); return IEEE80211_CONN_MODE_HE; } *chandef = eht_chandef; } return IEEE80211_CONN_MODE_EHT; } static bool ieee80211_verify_sta_ht_mcs_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_ht_operation *ht_op) { struct ieee80211_sta_ht_cap sta_ht_cap; int i; if (sband->band == NL80211_BAND_6GHZ) return true; if (!ht_op) return false; memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); /* * P802.11REVme/D7.0 - 6.5.4.2.4 * ... * If the MLME of an HT STA receives an MLME-JOIN.request primitive * with the SelectedBSS parameter containing a Basic HT-MCS Set field * in the HT Operation parameter that contains any unsupported MCSs, * the MLME response in the resulting MLME-JOIN.confirm primitive shall * contain a ResultCode parameter that is not set to the value SUCCESS. * ... */ /* Simply check that all basic rates are in the STA RX mask */ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) { if ((ht_op->basic_set[i] & sta_ht_cap.mcs.rx_mask[i]) != ht_op->basic_set[i]) return false; } return true; } static bool ieee80211_verify_sta_vht_mcs_support(struct ieee80211_sub_if_data *sdata, int link_id, struct ieee80211_supported_band *sband, const struct ieee80211_vht_operation *vht_op) { struct ieee80211_sta_vht_cap sta_vht_cap; u16 ap_min_req_set, sta_rx_mcs_map, sta_tx_mcs_map; int nss; if (sband->band != NL80211_BAND_5GHZ) return true; if (!vht_op) return false; memcpy(&sta_vht_cap, &sband->vht_cap, sizeof(sta_vht_cap)); ieee80211_apply_vhtcap_overrides(sdata, &sta_vht_cap); ap_min_req_set = le16_to_cpu(vht_op->basic_mcs_set); sta_rx_mcs_map = le16_to_cpu(sta_vht_cap.vht_mcs.rx_mcs_map); sta_tx_mcs_map = le16_to_cpu(sta_vht_cap.vht_mcs.tx_mcs_map); /* * Many APs are incorrectly advertising an all-zero value here, * which really means MCS 0-7 are required for 1-8 streams, but * they don't really mean it that way. * Some other APs are incorrectly advertising 3 spatial streams * with MCS 0-7 are required, but don't really mean it that way * and we'll connect only with HT, rather than even HE. * As a result, unfortunately the VHT basic MCS/NSS set cannot * be used at all, so check it only in strict mode. */ if (!ieee80211_hw_check(&sdata->local->hw, STRICT)) return true; /* * P802.11REVme/D7.0 - 6.5.4.2.4 * ... * If the MLME of a VHT STA receives an MLME-JOIN.request primitive * with a SelectedBSS parameter containing a Basic VHT-MCS And NSS Set * field in the VHT Operation parameter that contains any unsupported * <VHT-MCS, NSS> tuple, the MLME response in the resulting * MLME-JOIN.confirm primitive shall contain a ResultCode parameter * that is not set to the value SUCCESS. * ... */ for (nss = 8; nss > 0; nss--) { u8 ap_op_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; u8 sta_rx_val; u8 sta_tx_val; if (ap_op_val == IEEE80211_HE_MCS_NOT_SUPPORTED) continue; sta_rx_val = (sta_rx_mcs_map >> (2 * (nss - 1))) & 3; sta_tx_val = (sta_tx_mcs_map >> (2 * (nss - 1))) & 3; if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || sta_rx_val < ap_op_val || sta_tx_val < ap_op_val) { link_id_info(sdata, link_id, "Missing mandatory rates for %d Nss, rx %d, tx %d oper %d, disable VHT\n", nss, sta_rx_val, sta_tx_val, ap_op_val); return false; } } return true; } static bool ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, int link_id, const struct ieee80211_he_cap_elem *he_cap, const struct ieee80211_he_operation *he_op) { struct ieee80211_he_mcs_nss_supp *he_mcs_nss_supp; u16 mcs_80_map_tx, mcs_80_map_rx; u16 ap_min_req_set; int nss; if (!he_cap) return false; /* mcs_nss is right after he_cap info */ he_mcs_nss_supp = (void *)(he_cap + 1); mcs_80_map_tx = le16_to_cpu(he_mcs_nss_supp->tx_mcs_80); mcs_80_map_rx = le16_to_cpu(he_mcs_nss_supp->rx_mcs_80); /* P802.11-REVme/D0.3 * 27.1.1 Introduction to the HE PHY * ... * An HE STA shall support the following features: * ... * Single spatial stream HE-MCSs 0 to 7 (transmit and receive) in all * supported channel widths for HE SU PPDUs */ if ((mcs_80_map_tx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED || (mcs_80_map_rx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED) { link_id_info(sdata, link_id, "Missing mandatory rates for 1 Nss, rx 0x%x, tx 0x%x, disable HE\n", mcs_80_map_tx, mcs_80_map_rx); return false; } if (!he_op) return true; ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); /* * Apparently iPhone 13 (at least iOS version 15.3.1) sets this to all * zeroes, which is nonsense, and completely inconsistent with itself * (it doesn't have 8 streams). Accept the settings in this case anyway. */ if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set) return true; /* make sure the AP is consistent with itself * * P802.11-REVme/D0.3 * 26.17.1 Basic HE BSS operation * * A STA that is operating in an HE BSS shall be able to receive and * transmit at each of the <HE-MCS, NSS> tuple values indicated by the * Basic HE-MCS And NSS Set field of the HE Operation parameter of the * MLME-START.request primitive and shall be able to receive at each of * the <HE-MCS, NSS> tuple values indicated by the Supported HE-MCS and * NSS Set field in the HE Capabilities parameter of the MLMESTART.request * primitive */ for (nss = 8; nss > 0; nss--) { u8 ap_op_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; u8 ap_rx_val; u8 ap_tx_val; if (ap_op_val == IEEE80211_HE_MCS_NOT_SUPPORTED) continue; ap_rx_val = (mcs_80_map_rx >> (2 * (nss - 1))) & 3; ap_tx_val = (mcs_80_map_tx >> (2 * (nss - 1))) & 3; if (ap_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || ap_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || ap_rx_val < ap_op_val || ap_tx_val < ap_op_val) { link_id_info(sdata, link_id, "Invalid rates for %d Nss, rx %d, tx %d oper %d, disable HE\n", nss, ap_rx_val, ap_tx_val, ap_op_val); return false; } } return true; } static bool ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_he_operation *he_op) { const struct ieee80211_sta_he_cap *sta_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); u16 ap_min_req_set; int i; if (!sta_he_cap || !he_op) return false; ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); /* * Apparently iPhone 13 (at least iOS version 15.3.1) sets this to all * zeroes, which is nonsense, and completely inconsistent with itself * (it doesn't have 8 streams). Accept the settings in this case anyway. */ if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set) return true; /* Need to go over for 80MHz, 160MHz and for 80+80 */ for (i = 0; i < 3; i++) { const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp = &sta_he_cap->he_mcs_nss_supp; u16 sta_mcs_map_rx = le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]); u16 sta_mcs_map_tx = le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]); u8 nss; bool verified = true; /* * For each band there is a maximum of 8 spatial streams * possible. Each of the sta_mcs_map_* is a 16-bit struct built * of 2 bits per NSS (1-8), with the values defined in enum * ieee80211_he_mcs_support. Need to make sure STA TX and RX * capabilities aren't less than the AP's minimum requirements * for this HE BSS per SS. * It is enough to find one such band that meets the reqs. */ for (nss = 8; nss > 0; nss--) { u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3; u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3; u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED) continue; /* * Make sure the HE AP doesn't require MCSs that aren't * supported by the client as required by spec * * P802.11-REVme/D0.3 * 26.17.1 Basic HE BSS operation * * An HE STA shall not attempt to join * (MLME-JOIN.request primitive) * a BSS, unless it supports (i.e., is able to both transmit and * receive using) all of the <HE-MCS, NSS> tuples in the basic * HE-MCS and NSS set. */ if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) { verified = false; break; } } if (verified) return true; } /* If here, STA doesn't meet AP's HE min requirements */ return false; } static u8 ieee80211_get_eht_cap_mcs_nss(const struct ieee80211_sta_he_cap *sta_he_cap, const struct ieee80211_sta_eht_cap *sta_eht_cap, unsigned int idx, int bw) { u8 he_phy_cap0 = sta_he_cap->he_cap_elem.phy_cap_info[0]; u8 eht_phy_cap0 = sta_eht_cap->eht_cap_elem.phy_cap_info[0]; /* handle us being a 20 MHz-only EHT STA - with four values * for MCS 0-7, 8-9, 10-11, 12-13. */ if (!(he_phy_cap0 & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL)) return sta_eht_cap->eht_mcs_nss_supp.only_20mhz.rx_tx_max_nss[idx]; /* the others have MCS 0-9 together, rather than separately from 0-7 */ if (idx > 0) idx--; switch (bw) { case 0: return sta_eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_max_nss[idx]; case 1: if (!(he_phy_cap0 & (IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G))) return 0xff; /* pass check */ return sta_eht_cap->eht_mcs_nss_supp.bw._160.rx_tx_max_nss[idx]; case 2: if (!(eht_phy_cap0 & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ)) return 0xff; /* pass check */ return sta_eht_cap->eht_mcs_nss_supp.bw._320.rx_tx_max_nss[idx]; } WARN_ON(1); return 0; } static bool ieee80211_verify_sta_eht_mcs_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_eht_operation *eht_op) { const struct ieee80211_sta_he_cap *sta_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); const struct ieee80211_sta_eht_cap *sta_eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); const struct ieee80211_eht_mcs_nss_supp_20mhz_only *req; unsigned int i; if (!sta_he_cap || !sta_eht_cap || !eht_op) return false; req = &eht_op->basic_mcs_nss; for (i = 0; i < ARRAY_SIZE(req->rx_tx_max_nss); i++) { u8 req_rx_nss, req_tx_nss; unsigned int bw; req_rx_nss = u8_get_bits(req->rx_tx_max_nss[i], IEEE80211_EHT_MCS_NSS_RX); req_tx_nss = u8_get_bits(req->rx_tx_max_nss[i], IEEE80211_EHT_MCS_NSS_TX); for (bw = 0; bw < 3; bw++) { u8 have, have_rx_nss, have_tx_nss; have = ieee80211_get_eht_cap_mcs_nss(sta_he_cap, sta_eht_cap, i, bw); have_rx_nss = u8_get_bits(have, IEEE80211_EHT_MCS_NSS_RX); have_tx_nss = u8_get_bits(have, IEEE80211_EHT_MCS_NSS_TX); if (req_rx_nss > have_rx_nss || req_tx_nss > have_tx_nss) return false; } } return true; } static void ieee80211_get_rates(struct ieee80211_supported_band *sband, const u8 *supp_rates, unsigned int supp_rates_len, const u8 *ext_supp_rates, unsigned int ext_supp_rates_len, u32 *rates, u32 *basic_rates, unsigned long *unknown_rates_selectors, bool *have_higher_than_11mbit, int *min_rate, int *min_rate_index) { int i, j; for (i = 0; i < supp_rates_len + ext_supp_rates_len; i++) { u8 supp_rate = i < supp_rates_len ? supp_rates[i] : ext_supp_rates[i - supp_rates_len]; int rate = supp_rate & 0x7f; bool is_basic = !!(supp_rate & 0x80); if ((rate * 5) > 110 && have_higher_than_11mbit) *have_higher_than_11mbit = true; /* * Skip membership selectors since they're not rates. * * Note: Even though the membership selector and the basic * rate flag share the same bit, they are not exactly * the same. */ if (is_basic && rate >= BSS_MEMBERSHIP_SELECTOR_MIN) { if (unknown_rates_selectors) set_bit(rate, unknown_rates_selectors); continue; } for (j = 0; j < sband->n_bitrates; j++) { struct ieee80211_rate *br; int brate; br = &sband->bitrates[j]; brate = DIV_ROUND_UP(br->bitrate, 5); if (brate == rate) { if (rates) *rates |= BIT(j); if (is_basic && basic_rates) *basic_rates |= BIT(j); if (min_rate && (rate * 5) < *min_rate) { *min_rate = rate * 5; if (min_rate_index) *min_rate_index = j; } break; } } /* Handle an unknown entry as if it is an unknown selector */ if (is_basic && unknown_rates_selectors && j == sband->n_bitrates) set_bit(rate, unknown_rates_selectors); } } static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, u32 prohibited_flags) { if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, prohibited_flags)) return false; if (chandef->punctured && ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING)) return false; return true; } static int ieee80211_chandef_num_subchans(const struct cfg80211_chan_def *c) { if (c->width == NL80211_CHAN_WIDTH_80P80) return 4 + 4; return cfg80211_chandef_get_width(c) / 20; } static int ieee80211_chandef_num_widths(const struct cfg80211_chan_def *c) { switch (c->width) { case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_20_NOHT: return 1; case NL80211_CHAN_WIDTH_40: return 2; case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_80: return 3; case NL80211_CHAN_WIDTH_160: return 4; case NL80211_CHAN_WIDTH_320: return 5; default: WARN_ON(1); return 0; } } VISIBLE_IF_MAC80211_KUNIT int ieee80211_calc_chandef_subchan_offset(const struct cfg80211_chan_def *ap, u8 n_partial_subchans) { int n = ieee80211_chandef_num_subchans(ap); struct cfg80211_chan_def tmp = *ap; int offset = 0; /* * Given a chandef (in this context, it's the AP's) and a number * of subchannels that we want to look at ('n_partial_subchans'), * calculate the offset in number of subchannels between the full * and the subset with the desired width. */ /* same number of subchannels means no offset, obviously */ if (n == n_partial_subchans) return 0; /* don't WARN - misconfigured APs could cause this if their N > width */ if (n < n_partial_subchans) return 0; while (ieee80211_chandef_num_subchans(&tmp) > n_partial_subchans) { u32 prev = tmp.center_freq1; ieee80211_chandef_downgrade(&tmp, NULL); /* * if center_freq moved up, half the original channels * are gone now but were below, so increase offset */ if (prev < tmp.center_freq1) offset += ieee80211_chandef_num_subchans(&tmp); } /* * 80+80 with secondary 80 below primary - four subchannels for it * (we cannot downgrade *to* 80+80, so no need to consider 'tmp') */ if (ap->width == NL80211_CHAN_WIDTH_80P80 && ap->center_freq2 < ap->center_freq1) offset += 4; return offset; } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_calc_chandef_subchan_offset); VISIBLE_IF_MAC80211_KUNIT void ieee80211_rearrange_tpe_psd(struct ieee80211_parsed_tpe_psd *psd, const struct cfg80211_chan_def *ap, const struct cfg80211_chan_def *used) { u8 needed = ieee80211_chandef_num_subchans(used); u8 have = ieee80211_chandef_num_subchans(ap); u8 tmp[IEEE80211_TPE_PSD_ENTRIES_320MHZ]; u8 offset; if (!psd->valid) return; /* if N is zero, all defaults were used, no point in rearranging */ if (!psd->n) goto out; BUILD_BUG_ON(sizeof(tmp) != sizeof(psd->power)); /* * This assumes that 'N' is consistent with the HE channel, as * it should be (otherwise the AP is broken). * * In psd->power we have values in the order 0..N, 0..K, where * N+K should cover the entire channel per 'ap', but even if it * doesn't then we've pre-filled 'unlimited' as defaults. * * But this is all the wrong order, we want to have them in the * order of the 'used' channel. * * So for example, we could have a 320 MHz EHT AP, which has the * HE channel as 80 MHz (e.g. due to puncturing, which doesn't * seem to be considered for the TPE), as follows: * * EHT 320: | | | | | | | | | | | | | | | | | * HE 80: | | | | | * used 160: | | | | | | | | | * * N entries: |--|--|--|--| * K entries: |--|--|--|--|--|--|--|--| |--|--|--|--| * power idx: 4 5 6 7 8 9 10 11 0 1 2 3 12 13 14 15 * full chan: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * used chan: 0 1 2 3 4 5 6 7 * * The idx in the power array ('power idx') is like this since it * comes directly from the element's N and K entries in their * element order, and those are this way for HE compatibility. * * Rearrange them as desired here, first by putting them into the * 'full chan' order, and then selecting the necessary subset for * the 'used chan'. */ /* first reorder according to AP channel */ offset = ieee80211_calc_chandef_subchan_offset(ap, psd->n); for (int i = 0; i < have; i++) { if (i < offset) tmp[i] = psd->power[i + psd->n]; else if (i < offset + psd->n) tmp[i] = psd->power[i - offset]; else tmp[i] = psd->power[i]; } /* * and then select the subset for the used channel * (set everything to defaults first in case a driver is confused) */ memset(psd->power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(psd->power)); offset = ieee80211_calc_chandef_subchan_offset(ap, needed); for (int i = 0; i < needed; i++) psd->power[i] = tmp[offset + i]; out: /* limit, but don't lie if there are defaults in the data */ if (needed < psd->count) psd->count = needed; } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_rearrange_tpe_psd); static void ieee80211_rearrange_tpe(struct ieee80211_parsed_tpe *tpe, const struct cfg80211_chan_def *ap, const struct cfg80211_chan_def *used) { /* ignore this completely for narrow/invalid channels */ if (!ieee80211_chandef_num_subchans(ap) || !ieee80211_chandef_num_subchans(used)) { ieee80211_clear_tpe(tpe); return; } for (int i = 0; i < 2; i++) { int needed_pwr_count; ieee80211_rearrange_tpe_psd(&tpe->psd_local[i], ap, used); ieee80211_rearrange_tpe_psd(&tpe->psd_reg_client[i], ap, used); /* limit this to the widths we actually need */ needed_pwr_count = ieee80211_chandef_num_widths(used); if (needed_pwr_count < tpe->max_local[i].count) tpe->max_local[i].count = needed_pwr_count; if (needed_pwr_count < tpe->max_reg_client[i].count) tpe->max_reg_client[i].count = needed_pwr_count; } } /* * The AP part of the channel request is used to distinguish settings * to the device used for wider bandwidth OFDMA. This is used in the * channel context code to assign two channel contexts even if they're * both for the same channel, if the AP bandwidths are incompatible. * If not EHT (or driver override) then ap.chan == NULL indicates that * there's no wider BW OFDMA used. */ static void ieee80211_set_chanreq_ap(struct ieee80211_sub_if_data *sdata, struct ieee80211_chan_req *chanreq, struct ieee80211_conn_settings *conn, struct cfg80211_chan_def *ap_chandef) { chanreq->ap.chan = NULL; if (conn->mode < IEEE80211_CONN_MODE_EHT) return; if (sdata->vif.driver_flags & IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW) return; chanreq->ap = *ap_chandef; } VISIBLE_IF_MAC80211_KUNIT struct ieee802_11_elems * ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, struct ieee80211_conn_settings *conn, struct cfg80211_bss *cbss, int link_id, struct ieee80211_chan_req *chanreq, struct cfg80211_chan_def *ap_chandef, unsigned long *userspace_selectors) { const struct cfg80211_bss_ies *ies = rcu_dereference(cbss->ies); struct ieee80211_bss *bss = (void *)cbss->priv; struct ieee80211_channel *channel = cbss->channel; struct ieee80211_elems_parse_params parse_params = { .link_id = -1, .from_ap = true, .start = ies->data, .len = ies->len, }; struct ieee802_11_elems *elems; struct ieee80211_supported_band *sband; enum ieee80211_conn_mode ap_mode; unsigned long unknown_rates_selectors[BITS_TO_LONGS(128)] = {}; unsigned long sta_selectors[BITS_TO_LONGS(128)] = {}; int ret; again: parse_params.mode = conn->mode; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return ERR_PTR(-ENOMEM); ap_mode = ieee80211_determine_ap_chan(sdata, channel, bss->vht_cap_info, elems, false, conn, ap_chandef); /* this should be impossible since parsing depends on our mode */ if (WARN_ON(ap_mode > conn->mode)) { ret = -EINVAL; goto free; } if (conn->mode != ap_mode) { conn->mode = ap_mode; kfree(elems); goto again; } mlme_link_id_dbg(sdata, link_id, "determined AP %pM to be %s\n", cbss->bssid, ieee80211_conn_mode_str(ap_mode)); sband = sdata->local->hw.wiphy->bands[channel->band]; ieee80211_get_rates(sband, elems->supp_rates, elems->supp_rates_len, elems->ext_supp_rates, elems->ext_supp_rates_len, NULL, NULL, unknown_rates_selectors, NULL, NULL, NULL); switch (channel->band) { case NL80211_BAND_S1GHZ: if (WARN_ON(ap_mode != IEEE80211_CONN_MODE_S1G)) { ret = -EINVAL; goto free; } chanreq->oper = *ap_chandef; if (!cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, IEEE80211_CHAN_DISABLED)) { ret = -EINVAL; goto free; } return elems; case NL80211_BAND_6GHZ: if (ap_mode < IEEE80211_CONN_MODE_HE) { link_id_info(sdata, link_id, "Rejecting non-HE 6/7 GHz connection"); ret = -EINVAL; goto free; } break; default: if (WARN_ON(ap_mode == IEEE80211_CONN_MODE_S1G)) { ret = -EINVAL; goto free; } } switch (ap_mode) { case IEEE80211_CONN_MODE_S1G: WARN_ON(1); ret = -EINVAL; goto free; case IEEE80211_CONN_MODE_LEGACY: conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; case IEEE80211_CONN_MODE_HT: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_40); break; case IEEE80211_CONN_MODE_VHT: case IEEE80211_CONN_MODE_HE: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); break; case IEEE80211_CONN_MODE_EHT: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_320); break; } chanreq->oper = *ap_chandef; bitmap_copy(sta_selectors, userspace_selectors, 128); if (conn->mode >= IEEE80211_CONN_MODE_HT) set_bit(BSS_MEMBERSHIP_SELECTOR_HT_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_VHT) set_bit(BSS_MEMBERSHIP_SELECTOR_VHT_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_HE) set_bit(BSS_MEMBERSHIP_SELECTOR_HE_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_EHT) set_bit(BSS_MEMBERSHIP_SELECTOR_EHT_PHY, sta_selectors); /* * We do not support EPD or GLK so never add them. * SAE_H2E is handled through userspace_selectors. */ /* Check if we support all required features */ if (!bitmap_subset(unknown_rates_selectors, sta_selectors, 128)) { link_id_info(sdata, link_id, "required basic rate or BSS membership selectors not supported or disabled, rejecting connection\n"); ret = -EINVAL; goto free; } ieee80211_set_chanreq_ap(sdata, chanreq, conn, ap_chandef); while (!ieee80211_chandef_usable(sdata, &chanreq->oper, IEEE80211_CHAN_DISABLED)) { if (WARN_ON(chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT)) { ret = -EINVAL; goto free; } ieee80211_chanreq_downgrade(chanreq, conn); } if (conn->mode >= IEEE80211_CONN_MODE_HE && !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, IEEE80211_CHAN_NO_HE)) { conn->mode = IEEE80211_CONN_MODE_VHT; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); } if (conn->mode >= IEEE80211_CONN_MODE_EHT && !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, IEEE80211_CHAN_NO_EHT)) { conn->mode = IEEE80211_CONN_MODE_HE; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); } if (chanreq->oper.width != ap_chandef->width || ap_mode != conn->mode) link_id_info(sdata, link_id, "regulatory prevented using AP config, downgraded\n"); if (conn->mode >= IEEE80211_CONN_MODE_HT && !ieee80211_verify_sta_ht_mcs_support(sdata, sband, elems->ht_operation)) { conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; link_id_info(sdata, link_id, "required MCSes not supported, disabling HT\n"); } if (conn->mode >= IEEE80211_CONN_MODE_VHT && !ieee80211_verify_sta_vht_mcs_support(sdata, link_id, sband, elems->vht_operation)) { conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_40); link_id_info(sdata, link_id, "required MCSes not supported, disabling VHT\n"); } if (conn->mode >= IEEE80211_CONN_MODE_HE && (!ieee80211_verify_peer_he_mcs_support(sdata, link_id, (void *)elems->he_cap, elems->he_operation) || !ieee80211_verify_sta_he_mcs_support(sdata, sband, elems->he_operation))) { conn->mode = IEEE80211_CONN_MODE_VHT; link_id_info(sdata, link_id, "required MCSes not supported, disabling HE\n"); } if (conn->mode >= IEEE80211_CONN_MODE_EHT && !ieee80211_verify_sta_eht_mcs_support(sdata, sband, elems->eht_operation)) { conn->mode = IEEE80211_CONN_MODE_HE; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); link_id_info(sdata, link_id, "required MCSes not supported, disabling EHT\n"); } if (conn->mode >= IEEE80211_CONN_MODE_EHT && channel->band != NL80211_BAND_2GHZ && conn->bw_limit == IEEE80211_CONN_BW_LIMIT_40) { conn->mode = IEEE80211_CONN_MODE_HE; link_id_info(sdata, link_id, "required bandwidth not supported, disabling EHT\n"); } /* the mode can only decrease, so this must terminate */ if (ap_mode != conn->mode) { kfree(elems); goto again; } mlme_link_id_dbg(sdata, link_id, "connecting with %s mode, max bandwidth %d MHz\n", ieee80211_conn_mode_str(conn->mode), 20 * (1 << conn->bw_limit)); if (WARN_ON_ONCE(!cfg80211_chandef_valid(&chanreq->oper))) { ret = -EINVAL; goto free; } return elems; free: kfree(elems); return ERR_PTR(ret); } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_determine_chan_mode); static int ieee80211_config_bw(struct ieee80211_link_data *link, struct ieee802_11_elems *elems, bool update, u64 *changed, u16 stype) { struct ieee80211_channel *channel = link->conf->chanreq.oper.chan; struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chan_req chanreq = {}; struct cfg80211_chan_def ap_chandef; enum ieee80211_conn_mode ap_mode; const char *frame; u32 vht_cap_info = 0; u16 ht_opmode; int ret; switch (stype) { case IEEE80211_STYPE_BEACON: frame = "beacon"; break; case IEEE80211_STYPE_ASSOC_RESP: frame = "assoc response"; break; case IEEE80211_STYPE_REASSOC_RESP: frame = "reassoc response"; break; case IEEE80211_STYPE_ACTION: /* the only action frame that gets here */ frame = "ML reconf response"; break; default: return -EINVAL; } /* don't track any bandwidth changes in legacy/S1G modes */ if (link->u.mgd.conn.mode == IEEE80211_CONN_MODE_LEGACY || link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G) return 0; if (elems->vht_cap_elem) vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); ap_mode = ieee80211_determine_ap_chan(sdata, channel, vht_cap_info, elems, true, &link->u.mgd.conn, &ap_chandef); if (ap_mode != link->u.mgd.conn.mode) { link_info(link, "AP %pM appears to change mode (expected %s, found %s) in %s, disconnect\n", link->u.mgd.bssid, ieee80211_conn_mode_str(link->u.mgd.conn.mode), ieee80211_conn_mode_str(ap_mode), frame); return -EINVAL; } chanreq.oper = ap_chandef; ieee80211_set_chanreq_ap(sdata, &chanreq, &link->u.mgd.conn, &ap_chandef); /* * if HT operation mode changed store the new one - * this may be applicable even if channel is identical */ if (elems->ht_operation) { ht_opmode = le16_to_cpu(elems->ht_operation->operation_mode); if (link->conf->ht_operation_mode != ht_opmode) { *changed |= BSS_CHANGED_HT; link->conf->ht_operation_mode = ht_opmode; } } /* * Downgrade the new channel if we associated with restricted * bandwidth capabilities. For example, if we associated as a * 20 MHz STA to a 40 MHz AP (due to regulatory, capabilities * or config reasons) then switching to a 40 MHz channel now * won't do us any good -- we couldn't use it with the AP. */ while (link->u.mgd.conn.bw_limit < ieee80211_min_bw_limit_from_chandef(&chanreq.oper)) ieee80211_chandef_downgrade(&chanreq.oper, NULL); /* TPE element is not present in (re)assoc/ML reconfig response */ if (stype == IEEE80211_STYPE_BEACON && ap_chandef.chan->band == NL80211_BAND_6GHZ && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { ieee80211_rearrange_tpe(&elems->tpe, &ap_chandef, &chanreq.oper); if (memcmp(&link->conf->tpe, &elems->tpe, sizeof(elems->tpe))) { link->conf->tpe = elems->tpe; *changed |= BSS_CHANGED_TPE; } } if (ieee80211_chanreq_identical(&chanreq, &link->conf->chanreq)) return 0; link_info(link, "AP %pM changed bandwidth in %s, new used config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n", link->u.mgd.bssid, frame, chanreq.oper.chan->center_freq, chanreq.oper.chan->freq_offset, chanreq.oper.width, chanreq.oper.center_freq1, chanreq.oper.freq1_offset, chanreq.oper.center_freq2); if (!cfg80211_chandef_valid(&chanreq.oper)) { sdata_info(sdata, "AP %pM changed caps/bw in %s in a way we can't support - disconnect\n", link->u.mgd.bssid, frame); return -EINVAL; } if (!update) { link->conf->chanreq = chanreq; return 0; } /* * We're tracking the current AP here, so don't do any further checks * here. This keeps us from playing ping-pong with regulatory, without * it the following can happen (for example): * - connect to an AP with 80 MHz, world regdom allows 80 MHz * - AP advertises regdom US * - CRDA loads regdom US with 80 MHz prohibited (old database) * - we detect an unsupported channel and disconnect * - disconnect causes CRDA to reload world regdomain and the game * starts anew. * (see https://bugzilla.kernel.org/show_bug.cgi?id=70881) * * It seems possible that there are still scenarios with CSA or real * bandwidth changes where a this could happen, but those cases are * less common and wouldn't completely prevent using the AP. */ ret = ieee80211_link_change_chanreq(link, &chanreq, changed); if (ret) { sdata_info(sdata, "AP %pM changed bandwidth in %s to incompatible one - disconnect\n", link->u.mgd.bssid, frame); return ret; } cfg80211_schedule_channels_check(&sdata->wdev); return 0; } /* frame sending functions */ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u8 ap_ht_param, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, enum ieee80211_smps_mode smps, const struct ieee80211_conn_settings *conn) { u8 *pos; u32 flags = channel->flags; u16 cap; struct ieee80211_sta_ht_cap ht_cap; BUILD_BUG_ON(sizeof(ht_cap) != sizeof(sband->ht_cap)); memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap)); ieee80211_apply_htcap_overrides(sdata, &ht_cap); /* determine capability flags */ cap = ht_cap.cap; switch (ap_ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: if (flags & IEEE80211_CHAN_NO_HT40PLUS) { cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; cap &= ~IEEE80211_HT_CAP_SGI_40; } break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: if (flags & IEEE80211_CHAN_NO_HT40MINUS) { cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; cap &= ~IEEE80211_HT_CAP_SGI_40; } break; } /* * If 40 MHz was disabled associate as though we weren't * capable of 40 MHz -- some broken APs will never fall * back to trying to transmit in 20 MHz. */ if (conn->bw_limit <= IEEE80211_CONN_BW_LIMIT_20) { cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; cap &= ~IEEE80211_HT_CAP_SGI_40; } /* set SM PS mode properly */ cap &= ~IEEE80211_HT_CAP_SM_PS; switch (smps) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF: cap |= WLAN_HT_CAP_SM_PS_DISABLED << IEEE80211_HT_CAP_SM_PS_SHIFT; break; case IEEE80211_SMPS_STATIC: cap |= WLAN_HT_CAP_SM_PS_STATIC << IEEE80211_HT_CAP_SM_PS_SHIFT; break; case IEEE80211_SMPS_DYNAMIC: cap |= WLAN_HT_CAP_SM_PS_DYNAMIC << IEEE80211_HT_CAP_SM_PS_SHIFT; break; } /* reserve and fill IE */ pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2); ieee80211_ie_build_ht_cap(pos, &ht_cap, cap); } /* This function determines vht capability flags for the association * and builds the IE. * Note - the function returns true to own the MU-MIMO capability */ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_supported_band *sband, struct ieee80211_vht_cap *ap_vht_cap, const struct ieee80211_conn_settings *conn) { struct ieee80211_local *local = sdata->local; u8 *pos; u32 cap; struct ieee80211_sta_vht_cap vht_cap; u32 mask, ap_bf_sts, our_bf_sts; bool mu_mimo_owner = false; BUILD_BUG_ON(sizeof(vht_cap) != sizeof(sband->vht_cap)); memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap)); ieee80211_apply_vhtcap_overrides(sdata, &vht_cap); /* determine capability flags */ cap = vht_cap.cap; if (conn->bw_limit <= IEEE80211_CONN_BW_LIMIT_80) { cap &= ~IEEE80211_VHT_CAP_SHORT_GI_160; cap &= ~IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; } /* * Some APs apparently get confused if our capabilities are better * than theirs, so restrict what we advertise in the assoc request. */ if (!ieee80211_hw_check(&local->hw, STRICT)) { if (!(ap_vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); else if (!(ap_vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE))) cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; } /* * If some other vif is using the MU-MIMO capability we cannot associate * using MU-MIMO - this will lead to contradictions in the group-id * mechanism. * Ownership is defined since association request, in order to avoid * simultaneous associations with MU-MIMO. */ if (cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) { bool disable_mu_mimo = false; struct ieee80211_sub_if_data *other; list_for_each_entry(other, &local->interfaces, list) { if (other->vif.bss_conf.mu_mimo_owner) { disable_mu_mimo = true; break; } } if (disable_mu_mimo) cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; else mu_mimo_owner = true; } mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK; ap_bf_sts = le32_to_cpu(ap_vht_cap->vht_cap_info) & mask; our_bf_sts = cap & mask; if (ap_bf_sts < our_bf_sts) { cap &= ~mask; cap |= ap_bf_sts; } /* reserve and fill IE */ pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2); ieee80211_ie_build_vht_cap(pos, &vht_cap, cap); return mu_mimo_owner; } static void ieee80211_assoc_add_rates(struct ieee80211_local *local, struct sk_buff *skb, enum nl80211_chan_width width, struct ieee80211_supported_band *sband, struct ieee80211_mgd_assoc_data *assoc_data) { u32 rates; if (assoc_data->supp_rates_len && !ieee80211_hw_check(&local->hw, STRICT)) { /* * Get all rates supported by the device and the AP as * some APs don't like getting a superset of their rates * in the association request (e.g. D-Link DAP 1353 in * b-only mode)... */ ieee80211_parse_bitrates(width, sband, assoc_data->supp_rates, assoc_data->supp_rates_len, &rates); } else { /* * In case AP not provide any supported rates information * before association, we send information element(s) with * all rates that we support. */ rates = ~0; } ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_SUPP_RATES); ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_EXT_SUPP_RATES); } static size_t ieee80211_add_before_ht_elems(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { size_t noffset; static const u8 before_ht[] = { WLAN_EID_SSID, WLAN_EID_SUPP_RATES, WLAN_EID_EXT_SUPP_RATES, WLAN_EID_PWR_CAPABILITY, WLAN_EID_SUPPORTED_CHANNELS, WLAN_EID_RSN, WLAN_EID_QOS_CAPA, WLAN_EID_RRM_ENABLED_CAPABILITIES, WLAN_EID_MOBILITY_DOMAIN, WLAN_EID_FAST_BSS_TRANSITION, /* reassoc only */ WLAN_EID_RIC_DATA, /* reassoc only */ WLAN_EID_SUPPORTED_REGULATORY_CLASSES, }; static const u8 after_ric[] = { WLAN_EID_SUPPORTED_REGULATORY_CLASSES, WLAN_EID_HT_CAPABILITY, WLAN_EID_BSS_COEX_2040, /* luckily this is almost always there */ WLAN_EID_EXT_CAPABILITY, WLAN_EID_QOS_TRAFFIC_CAPA, WLAN_EID_TIM_BCAST_REQ, WLAN_EID_INTERWORKING, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ WLAN_EID_VHT_CAPABILITY, WLAN_EID_OPMODE_NOTIF, }; if (!elems_len) return offset; noffset = ieee80211_ie_split_ric(elems, elems_len, before_ht, ARRAY_SIZE(before_ht), after_ric, ARRAY_SIZE(after_ric), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } static size_t ieee80211_add_before_vht_elems(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { static const u8 before_vht[] = { /* * no need to list the ones split off before HT * or generated here */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_QOS_TRAFFIC_CAPA, WLAN_EID_TIM_BCAST_REQ, WLAN_EID_INTERWORKING, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; size_t noffset; if (!elems_len) return offset; /* RIC already taken care of in ieee80211_add_before_ht_elems() */ noffset = ieee80211_ie_split(elems, elems_len, before_vht, ARRAY_SIZE(before_vht), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } static size_t ieee80211_add_before_he_elems(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { static const u8 before_he[] = { /* * no need to list the ones split off before VHT * or generated here */ WLAN_EID_OPMODE_NOTIF, WLAN_EID_EXTENSION, WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE, /* 11ai elements */ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_SESSION, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_PUBLIC_KEY, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_KEY_CONFIRM, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_HLP_CONTAINER, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN, /* TODO: add 11ah/11aj/11ak elements */ }; size_t noffset; if (!elems_len) return offset; /* RIC already taken care of in ieee80211_add_before_ht_elems() */ noffset = ieee80211_ie_split(elems, elems_len, before_he, ARRAY_SIZE(before_he), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } static size_t ieee80211_add_before_reg_conn(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { static const u8 before_reg_conn[] = { /* * no need to list the ones split off before HE * or generated here */ WLAN_EID_EXTENSION, WLAN_EID_EXT_DH_PARAMETER, WLAN_EID_EXTENSION, WLAN_EID_EXT_KNOWN_STA_IDENTIFCATION, }; size_t noffset; if (!elems_len) return offset; noffset = ieee80211_ie_split(elems, elems_len, before_reg_conn, ARRAY_SIZE(before_reg_conn), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } #define PRESENT_ELEMS_MAX 8 #define PRESENT_ELEM_EXT_OFFS 0x100 static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u16 capab, const struct element *ext_capa, const u16 *present_elems, struct ieee80211_mgd_assoc_data *assoc_data); static size_t ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u16 *capab, const struct element *ext_capa, const u8 *extra_elems, size_t extra_elems_len, unsigned int link_id, struct ieee80211_link_data *link, u16 *present_elems, struct ieee80211_mgd_assoc_data *assoc_data) { enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_channel *chan = cbss->channel; const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20; struct ieee80211_chanctx_conf *chanctx_conf; enum ieee80211_smps_mode smps_mode; u16 orig_capab = *capab; size_t offset = 0; int present_elems_len = 0; u8 *pos; int i; #define ADD_PRESENT_ELEM(id) do { \ /* need a last for termination - we use 0 == SSID */ \ if (!WARN_ON(present_elems_len >= PRESENT_ELEMS_MAX - 1)) \ present_elems[present_elems_len++] = (id); \ } while (0) #define ADD_PRESENT_EXT_ELEM(id) ADD_PRESENT_ELEM(PRESENT_ELEM_EXT_OFFS | (id)) if (link) smps_mode = link->smps_mode; else if (sdata->u.mgd.powersave) smps_mode = IEEE80211_SMPS_DYNAMIC; else smps_mode = IEEE80211_SMPS_OFF; if (link) { /* * 5/10 MHz scenarios are only viable without MLO, in which * case this pointer should be used ... All of this is a bit * unclear though, not sure this even works at all. */ rcu_read_lock(); chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (chanctx_conf) width = chanctx_conf->def.width; rcu_read_unlock(); } sband = local->hw.wiphy->bands[chan->band]; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (sband->band == NL80211_BAND_2GHZ) { *capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; *capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; } if ((cbss->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) && ieee80211_hw_check(&local->hw, SPECTRUM_MGMT)) *capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; if (sband->band != NL80211_BAND_S1GHZ) ieee80211_assoc_add_rates(local, skb, width, sband, assoc_data); if (*capab & WLAN_CAPABILITY_SPECTRUM_MGMT || *capab & WLAN_CAPABILITY_RADIO_MEASURE) { struct cfg80211_chan_def chandef = { .width = width, .chan = chan, }; pos = skb_put(skb, 4); *pos++ = WLAN_EID_PWR_CAPABILITY; *pos++ = 2; *pos++ = 0; /* min tx power */ /* max tx power */ *pos++ = ieee80211_chandef_max_power(&chandef); ADD_PRESENT_ELEM(WLAN_EID_PWR_CAPABILITY); } /* * Per spec, we shouldn't include the list of channels if we advertise * support for extended channel switching, but we've always done that; * (for now?) apply this restriction only on the (new) 6 GHz band. */ if (*capab & WLAN_CAPABILITY_SPECTRUM_MGMT && (sband->band != NL80211_BAND_6GHZ || !ext_capa || ext_capa->datalen < 1 || !(ext_capa->data[0] & WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING))) { /* TODO: get this in reg domain format */ pos = skb_put(skb, 2 * sband->n_channels + 2); *pos++ = WLAN_EID_SUPPORTED_CHANNELS; *pos++ = 2 * sband->n_channels; for (i = 0; i < sband->n_channels; i++) { int cf = sband->channels[i].center_freq; *pos++ = ieee80211_frequency_to_channel(cf); *pos++ = 1; /* one channel in the subband*/ } ADD_PRESENT_ELEM(WLAN_EID_SUPPORTED_CHANNELS); } /* if present, add any custom IEs that go before HT */ offset = ieee80211_add_before_ht_elems(skb, extra_elems, extra_elems_len, offset); if (sband->band != NL80211_BAND_6GHZ && assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_HT) { ieee80211_add_ht_ie(sdata, skb, assoc_data->link[link_id].ap_ht_param, sband, chan, smps_mode, &assoc_data->link[link_id].conn); ADD_PRESENT_ELEM(WLAN_EID_HT_CAPABILITY); } /* if present, add any custom IEs that go before VHT */ offset = ieee80211_add_before_vht_elems(skb, extra_elems, extra_elems_len, offset); if (sband->band != NL80211_BAND_6GHZ && assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_VHT && sband->vht_cap.vht_supported) { bool mu_mimo_owner = ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->link[link_id].ap_vht_cap, &assoc_data->link[link_id].conn); if (link) link->conf->mu_mimo_owner = mu_mimo_owner; ADD_PRESENT_ELEM(WLAN_EID_VHT_CAPABILITY); } /* if present, add any custom IEs that go before HE */ offset = ieee80211_add_before_he_elems(skb, extra_elems, extra_elems_len, offset); if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_HE) { ieee80211_put_he_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_HE_CAPABILITY); if (sband->band == NL80211_BAND_6GHZ) ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode); } /* * if present, add any custom IEs that go before regulatory * connectivity element */ offset = ieee80211_add_before_reg_conn(skb, extra_elems, extra_elems_len, offset); if (sband->band == NL80211_BAND_6GHZ) { /* * as per Section E.2.7 of IEEE 802.11 REVme D7.0, non-AP STA * capable of operating on the 6 GHz band shall transmit * regulatory connectivity element. */ ieee80211_put_reg_conn(skb, chan->flags); } /* * careful - need to know about all the present elems before * calling ieee80211_assoc_add_ml_elem(), so add this one if * we're going to put it after the ML element */ if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT) ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_EHT_CAPABILITY); if (link_id == assoc_data->assoc_link_id) ieee80211_assoc_add_ml_elem(sdata, skb, orig_capab, ext_capa, present_elems, assoc_data); /* crash if somebody gets it wrong */ present_elems = NULL; if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT) ieee80211_put_eht_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); if (sband->band == NL80211_BAND_S1GHZ) { ieee80211_add_aid_request_ie(sdata, skb); ieee80211_add_s1g_capab_ie(sdata, &sband->s1g_cap, skb); } if (iftd && iftd->vendor_elems.data && iftd->vendor_elems.len) skb_put_data(skb, iftd->vendor_elems.data, iftd->vendor_elems.len); return offset; } static void ieee80211_add_non_inheritance_elem(struct sk_buff *skb, const u16 *outer, const u16 *inner) { unsigned int skb_len = skb->len; bool at_extension = false; bool added = false; int i, j; u8 *len, *list_len = NULL; skb_put_u8(skb, WLAN_EID_EXTENSION); len = skb_put(skb, 1); skb_put_u8(skb, WLAN_EID_EXT_NON_INHERITANCE); for (i = 0; i < PRESENT_ELEMS_MAX && outer[i]; i++) { u16 elem = outer[i]; bool have_inner = false; /* should at least be sorted in the sense of normal -> ext */ WARN_ON(at_extension && elem < PRESENT_ELEM_EXT_OFFS); /* switch to extension list */ if (!at_extension && elem >= PRESENT_ELEM_EXT_OFFS) { at_extension = true; if (!list_len) skb_put_u8(skb, 0); list_len = NULL; } for (j = 0; j < PRESENT_ELEMS_MAX && inner[j]; j++) { if (elem == inner[j]) { have_inner = true; break; } } if (have_inner) continue; if (!list_len) { list_len = skb_put(skb, 1); *list_len = 0; } *list_len += 1; skb_put_u8(skb, (u8)elem); added = true; } /* if we added a list but no extension list, make a zero-len one */ if (added && (!at_extension || !list_len)) skb_put_u8(skb, 0); /* if nothing added remove extension element completely */ if (!added) skb_trim(skb, skb_len); else *len = skb->len - skb_len - 2; } static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u16 capab, const struct element *ext_capa, const u16 *outer_present_elems, struct ieee80211_mgd_assoc_data *assoc_data) { struct ieee80211_local *local = sdata->local; struct ieee80211_multi_link_elem *ml_elem; struct ieee80211_mle_basic_common_info *common; const struct wiphy_iftype_ext_capab *ift_ext_capa; __le16 eml_capa = 0, mld_capa_ops = 0; unsigned int link_id; u8 *ml_elem_len; void *capab_pos; if (!ieee80211_vif_is_mld(&sdata->vif)) return; ift_ext_capa = cfg80211_get_iftype_ext_capa(local->hw.wiphy, ieee80211_vif_type_p2p(&sdata->vif)); if (ift_ext_capa) { eml_capa = cpu_to_le16(ift_ext_capa->eml_capabilities); mld_capa_ops = cpu_to_le16(ift_ext_capa->mld_capa_and_ops); } skb_put_u8(skb, WLAN_EID_EXTENSION); ml_elem_len = skb_put(skb, 1); skb_put_u8(skb, WLAN_EID_EXT_EHT_MULTI_LINK); ml_elem = skb_put(skb, sizeof(*ml_elem)); ml_elem->control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC | IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP); common = skb_put(skb, sizeof(*common)); common->len = sizeof(*common) + 2; /* MLD capa/ops */ memcpy(common->mld_mac_addr, sdata->vif.addr, ETH_ALEN); /* add EML_CAPA only if needed, see Draft P802.11be_D2.1, 35.3.17 */ if (eml_capa & cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP | IEEE80211_EML_CAP_EMLMR_SUPPORT))) { common->len += 2; /* EML capabilities */ ml_elem->control |= cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EML_CAPA); skb_put_data(skb, &eml_capa, sizeof(eml_capa)); } skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops)); if (assoc_data->ext_mld_capa_ops) { ml_elem->control |= cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP); common->len += 2; skb_put_data(skb, &assoc_data->ext_mld_capa_ops, sizeof(assoc_data->ext_mld_capa_ops)); } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { u16 link_present_elems[PRESENT_ELEMS_MAX] = {}; const u8 *extra_elems; size_t extra_elems_len; size_t extra_used; u8 *subelem_len = NULL; __le16 ctrl; if (!assoc_data->link[link_id].bss || link_id == assoc_data->assoc_link_id) continue; extra_elems = assoc_data->link[link_id].elems; extra_elems_len = assoc_data->link[link_id].elems_len; skb_put_u8(skb, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE); subelem_len = skb_put(skb, 1); ctrl = cpu_to_le16(link_id | IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE | IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT); skb_put_data(skb, &ctrl, sizeof(ctrl)); skb_put_u8(skb, 1 + ETH_ALEN); /* STA Info Length */ skb_put_data(skb, assoc_data->link[link_id].addr, ETH_ALEN); /* * Now add the contents of the (re)association request, * but the "listen interval" and "current AP address" * (if applicable) are skipped. So we only have * the capability field (remember the position and fill * later), followed by the elements added below by * calling ieee80211_add_link_elems(). */ capab_pos = skb_put(skb, 2); extra_used = ieee80211_add_link_elems(sdata, skb, &capab, ext_capa, extra_elems, extra_elems_len, link_id, NULL, link_present_elems, assoc_data); if (extra_elems) skb_put_data(skb, extra_elems + extra_used, extra_elems_len - extra_used); put_unaligned_le16(capab, capab_pos); ieee80211_add_non_inheritance_elem(skb, outer_present_elems, link_present_elems); ieee80211_fragment_element(skb, subelem_len, IEEE80211_MLE_SUBELEM_FRAGMENT); } ieee80211_fragment_element(skb, ml_elem_len, WLAN_EID_FRAGMENT); } static int ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype iftype, struct cfg80211_bss *cbss, size_t elems_len) { struct ieee80211_local *local = sdata->local; const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_supported_band *sband; size_t size = 0; if (!cbss) return size; sband = local->hw.wiphy->bands[cbss->channel->band]; /* add STA profile elements length */ size += elems_len; /* and supported rates length */ size += 4 + sband->n_bitrates; /* supported channels */ size += 2 + 2 * sband->n_channels; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (iftd) size += iftd->vendor_elems.len; /* power capability */ size += 4; /* HT, VHT, HE, EHT */ size += 2 + sizeof(struct ieee80211_ht_cap); size += 2 + sizeof(struct ieee80211_vht_cap); size += 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + sizeof(struct ieee80211_he_mcs_nss_supp) + IEEE80211_HE_PPE_THRES_MAX_LEN; if (sband->band == NL80211_BAND_6GHZ) { size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa); /* reg connection */ size += 4; } size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) + sizeof(struct ieee80211_eht_mcs_nss_supp) + IEEE80211_EHT_PPE_THRES_MAX_LEN; return size; } static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; struct ieee80211_link_data *link; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u8 *pos, qos_info, *ie_start; size_t offset, noffset; u16 capab = 0, link_capab; __le16 listen_int; struct element *ext_capa = NULL; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); struct ieee80211_prep_tx_info info = {}; unsigned int link_id, n_links = 0; u16 present_elems[PRESENT_ELEMS_MAX] = {}; void *capab_pos; size_t size; int ret; /* we know it's writable, cast away the const */ if (assoc_data->ie_len) ext_capa = (void *)cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, assoc_data->ie, assoc_data->ie_len); lockdep_assert_wiphy(sdata->local->hw.wiphy); size = local->hw.extra_tx_headroom + sizeof(*mgmt) + /* bit too much but doesn't matter */ 2 + assoc_data->ssid_len + /* SSID */ assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 9; /* WMM */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; size_t elems_len = assoc_data->link[link_id].elems_len; if (!cbss) continue; n_links++; size += ieee80211_link_common_elems_size(sdata, iftype, cbss, elems_len); /* non-inheritance element */ size += 2 + 2 + PRESENT_ELEMS_MAX; /* should be the same across all BSSes */ if (cbss->capability & WLAN_CAPABILITY_PRIVACY) capab |= WLAN_CAPABILITY_PRIVACY; } if (ieee80211_vif_is_mld(&sdata->vif)) { /* consider the multi-link element with STA profile */ size += sizeof(struct ieee80211_multi_link_elem); /* max common info field in basic multi-link element */ size += sizeof(struct ieee80211_mle_basic_common_info) + 2 + /* capa & op */ 2 + /* ext capa & op */ 2; /* EML capa */ /* The capability elements were already considered above */ size += (n_links - 1) * (1 + 1 + /* subelement ID/length */ 2 + /* STA control */ 1 + ETH_ALEN + 2 /* STA Info field */); } link = sdata_dereference(sdata->link[assoc_data->assoc_link_id], sdata); if (WARN_ON(!link)) return -EINVAL; if (WARN_ON(!assoc_data->link[assoc_data->assoc_link_id].bss)) return -EINVAL; skb = alloc_skb(size, GFP_KERNEL); if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); if (ifmgd->flags & IEEE80211_STA_ENABLE_RRM) capab |= WLAN_CAPABILITY_RADIO_MEASURE; /* Set MBSSID support for HE AP if needed */ if (ieee80211_hw_check(&local->hw, SUPPORTS_ONLY_HE_MULTI_BSSID) && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE && ext_capa && ext_capa->datalen >= 3) ext_capa->data[2] |= WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT; mgmt = skb_put_zero(skb, 24); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); listen_int = cpu_to_le16(assoc_data->s1g ? ieee80211_encode_usf(local->hw.conf.listen_interval) : local->hw.conf.listen_interval); if (!is_zero_ether_addr(assoc_data->prev_ap_addr)) { skb_put(skb, 10); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ); capab_pos = &mgmt->u.reassoc_req.capab_info; mgmt->u.reassoc_req.listen_interval = listen_int; memcpy(mgmt->u.reassoc_req.current_ap, assoc_data->prev_ap_addr, ETH_ALEN); info.subtype = IEEE80211_STYPE_REASSOC_REQ; } else { skb_put(skb, 4); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ); capab_pos = &mgmt->u.assoc_req.capab_info; mgmt->u.assoc_req.listen_interval = listen_int; info.subtype = IEEE80211_STYPE_ASSOC_REQ; } /* SSID */ pos = skb_put(skb, 2 + assoc_data->ssid_len); ie_start = pos; *pos++ = WLAN_EID_SSID; *pos++ = assoc_data->ssid_len; memcpy(pos, assoc_data->ssid, assoc_data->ssid_len); /* * This bit is technically reserved, so it shouldn't matter for either * the AP or us, but it also means we shouldn't set it. However, we've * always set it in the past, and apparently some EHT APs check that * we don't set it. To avoid interoperability issues with old APs that * for some reason check it and want it to be set, set the bit for all * pre-EHT connections as we used to do. */ if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT && !ieee80211_hw_check(&local->hw, STRICT)) capab |= WLAN_CAPABILITY_ESS; /* add the elements for the assoc (main) link */ link_capab = capab; offset = ieee80211_add_link_elems(sdata, skb, &link_capab, ext_capa, assoc_data->ie, assoc_data->ie_len, assoc_data->assoc_link_id, link, present_elems, assoc_data); put_unaligned_le16(link_capab, capab_pos); /* if present, add any custom non-vendor IEs */ if (assoc_data->ie_len) { noffset = ieee80211_ie_split_vendor(assoc_data->ie, assoc_data->ie_len, offset); skb_put_data(skb, assoc_data->ie + offset, noffset - offset); offset = noffset; } if (assoc_data->wmm) { if (assoc_data->uapsd) { qos_info = ifmgd->uapsd_queues; qos_info |= (ifmgd->uapsd_max_sp_len << IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT); } else { qos_info = 0; } pos = ieee80211_add_wmm_info_ie(skb_put(skb, 9), qos_info); } /* add any remaining custom (i.e. vendor specific here) IEs */ if (assoc_data->ie_len) { noffset = assoc_data->ie_len; skb_put_data(skb, assoc_data->ie + offset, noffset - offset); } if (assoc_data->fils_kek_len) { ret = fils_encrypt_assoc_req(skb, assoc_data); if (ret < 0) { dev_kfree_skb(skb); return ret; } } pos = skb_tail_pointer(skb); kfree(ifmgd->assoc_req_ies); ifmgd->assoc_req_ies = kmemdup(ie_start, pos - ie_start, GFP_ATOMIC); if (!ifmgd->assoc_req_ies) { dev_kfree_skb(skb); return -ENOMEM; } ifmgd->assoc_req_ies_len = pos - ie_start; info.link_id = assoc_data->assoc_link_id; drv_mgd_prepare_tx(local, sdata, &info); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_tx_skb(sdata, skb); return 0; } void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_pspoll *pspoll; struct sk_buff *skb; skb = ieee80211_pspoll_get(&local->hw, &sdata->vif); if (!skb) return; pspoll = (struct ieee80211_pspoll *) skb->data; pspoll->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave) { struct sk_buff *skb; struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, -1, !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) return; nullfunc = (struct ieee80211_hdr_3addr *) skb->data; if (powersave) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_USE_MINRATE; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct sk_buff *skb; struct ieee80211_hdr *nullfunc; __le16 fc; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put_zero(skb, 30); fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); nullfunc->frame_control = fc; memcpy(nullfunc->addr1, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(nullfunc->addr4, sdata->vif.addr, ETH_ALEN); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_USE_MINRATE; ieee80211_tx_skb(sdata, skb); } /* spectrum management related things */ static void ieee80211_csa_switch_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, u.mgd.csa.switch_work.work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int ret; if (!ieee80211_sdata_running(sdata)) return; lockdep_assert_wiphy(local->hw.wiphy); if (!ifmgd->associated) return; if (!link->conf->csa_active) return; /* * If the link isn't active (now), we cannot wait for beacons, won't * have a reserved chanctx, etc. Just switch over the chandef and * update cfg80211 directly. */ if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) { struct link_sta_info *link_sta; struct sta_info *ap_sta; link->conf->chanreq = link->csa.chanreq; cfg80211_ch_switch_notify(sdata->dev, &link->csa.chanreq.oper, link->link_id); link->conf->csa_active = false; ap_sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (WARN_ON(!ap_sta)) return; link_sta = wiphy_dereference(wiphy, ap_sta->link[link->link_id]); if (WARN_ON(!link_sta)) return; link_sta->pub->bandwidth = _ieee80211_sta_cur_vht_bw(link_sta, &link->csa.chanreq.oper); return; } /* * using reservation isn't immediate as it may be deferred until later * with multi-vif. once reservation is complete it will re-schedule the * work with no reserved_chanctx so verify chandef to check if it * completed successfully */ if (link->reserved_chanctx) { /* * with multi-vif csa driver may call ieee80211_csa_finish() * many times while waiting for other interfaces to use their * reservations */ if (link->reserved_ready) return; ret = ieee80211_link_use_reserved_context(link); if (ret) { link_info(link, "failed to use reserved channel context, disconnecting (err=%d)\n", ret); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); } return; } if (!ieee80211_chanreq_identical(&link->conf->chanreq, &link->csa.chanreq)) { link_info(link, "failed to finalize channel switch, disconnecting\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } link->u.mgd.csa.waiting_bcn = true; /* apply new TPE restrictions immediately on the new channel */ if (link->u.mgd.csa.ap_chandef.chan->band == NL80211_BAND_6GHZ && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { ieee80211_rearrange_tpe(&link->u.mgd.csa.tpe, &link->u.mgd.csa.ap_chandef, &link->conf->chanreq.oper); if (memcmp(&link->conf->tpe, &link->u.mgd.csa.tpe, sizeof(link->u.mgd.csa.tpe))) { link->conf->tpe = link->u.mgd.csa.tpe; ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_TPE); } } /* * It is not necessary to reset these timers if any link does not * have an active CSA and that link still receives the beacons * when other links have active CSA. */ for_each_link_data(sdata, link) { if (!link->conf->csa_active) return; } /* * Reset the beacon monitor and connection monitor timers when CSA * is active for all links in MLO when channel switch occurs in all * the links. */ ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); } static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); WARN_ON(!link->conf->csa_active); ieee80211_vif_unblock_queues_csa(sdata); link->conf->csa_active = false; link->u.mgd.csa.blocked_tx = false; link->u.mgd.csa.waiting_bcn = false; ret = drv_post_channel_switch(link); if (ret) { link_info(link, "driver post channel switch failed, disconnecting\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } cfg80211_ch_switch_notify(sdata->dev, &link->conf->chanreq.oper, link->link_id); } void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); trace_api_chswitch_done(sdata, success, link_id); rcu_read_lock(); if (!success) { sdata_info(sdata, "driver channel switch failed (link %d), disconnecting\n", link_id); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.csa_connection_drop_work); } else { struct ieee80211_link_data *link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); return; } wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work, 0); } rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_chswitch_done); static void ieee80211_sta_abort_chanswitch(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->abort_channel_switch) return; if (rcu_access_pointer(link->conf->chanctx_conf)) ieee80211_link_unreserve_chanctx(link); ieee80211_vif_unblock_queues_csa(sdata); link->conf->csa_active = false; link->u.mgd.csa.blocked_tx = false; drv_abort_channel_switch(link); } struct sta_csa_rnr_iter_data { struct ieee80211_link_data *link; struct ieee80211_channel *chan; u8 mld_id; }; static enum cfg80211_rnr_iter_ret ieee80211_sta_csa_rnr_iter(void *_data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len) { struct sta_csa_rnr_iter_data *data = _data; struct ieee80211_link_data *link = data->link; struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; const struct ieee80211_tbtt_info_ge_11 *ti; enum nl80211_band band; unsigned int center_freq; int link_id; if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) return RNR_ITER_CONTINUE; if (tbtt_info_len < sizeof(*ti)) return RNR_ITER_CONTINUE; ti = (const void *)tbtt_info; if (ti->mld_params.mld_id != data->mld_id) return RNR_ITER_CONTINUE; link_id = le16_get_bits(ti->mld_params.params, IEEE80211_RNR_MLD_PARAMS_LINK_ID); if (link_id != data->link->link_id) return RNR_ITER_CONTINUE; /* we found the entry for our link! */ /* this AP is confused, it had this right before ... just disconnect */ if (!ieee80211_operating_class_to_band(info->op_class, &band)) { link_info(link, "AP now has invalid operating class in RNR, disconnect\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return RNR_ITER_BREAK; } center_freq = ieee80211_channel_to_frequency(info->channel, band); data->chan = ieee80211_get_channel(sdata->local->hw.wiphy, center_freq); return RNR_ITER_BREAK; } static void ieee80211_sta_other_link_csa_disappeared(struct ieee80211_link_data *link, struct ieee802_11_elems *elems) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct sta_csa_rnr_iter_data data = { .link = link, }; /* * If we get here, we see a beacon from another link without * CSA still being reported for it, so now we have to check * if the CSA was aborted or completed. This may not even be * perfectly possible if the CSA was only done for changing * the puncturing, but in that case if the link in inactive * we don't really care, and if it's an active link (or when * it's activated later) we'll get a beacon and adjust. */ if (WARN_ON(!elems->ml_basic)) return; data.mld_id = ieee80211_mle_get_mld_id((const void *)elems->ml_basic); /* * So in order to do this, iterate the RNR element(s) and see * what channel is reported now. */ cfg80211_iter_rnr(elems->ie_start, elems->total_len, ieee80211_sta_csa_rnr_iter, &data); if (!data.chan) { link_info(link, "couldn't find (valid) channel in RNR for CSA, disconnect\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } /* * If it doesn't match the CSA, then assume it aborted. This * may erroneously detect that it was _not_ aborted when it * was in fact aborted, but only changed the bandwidth or the * puncturing configuration, but we don't have enough data to * detect that. */ if (data.chan != link->csa.chanreq.oper.chan) ieee80211_sta_abort_chanswitch(link); } enum ieee80211_csa_source { IEEE80211_CSA_SOURCE_BEACON, IEEE80211_CSA_SOURCE_OTHER_LINK, IEEE80211_CSA_SOURCE_PROT_ACTION, IEEE80211_CSA_SOURCE_UNPROT_ACTION, }; static void ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, u64 timestamp, u32 device_timestamp, struct ieee802_11_elems *full_elems, struct ieee802_11_elems *csa_elems, enum ieee80211_csa_source source) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_chanctx *chanctx = NULL; struct ieee80211_chanctx_conf *conf; struct ieee80211_csa_ie csa_ie = {}; struct ieee80211_channel_switch ch_switch = { .link_id = link->link_id, .timestamp = timestamp, .device_timestamp = device_timestamp, }; u32 csa_time_tu; ktime_t now; int res; lockdep_assert_wiphy(local->hw.wiphy); if (csa_elems) { struct cfg80211_bss *cbss = link->conf->bss; enum nl80211_band current_band; struct ieee80211_bss *bss; if (WARN_ON(!cbss)) return; current_band = cbss->channel->band; bss = (void *)cbss->priv; res = ieee80211_parse_ch_switch_ie(sdata, csa_elems, current_band, bss->vht_cap_info, &link->u.mgd.conn, link->u.mgd.bssid, source == IEEE80211_CSA_SOURCE_UNPROT_ACTION, &csa_ie); if (res == 0) { ch_switch.block_tx = csa_ie.mode; ch_switch.chandef = csa_ie.chanreq.oper; ch_switch.count = csa_ie.count; ch_switch.delay = csa_ie.max_switch_time; } link->u.mgd.csa.tpe = csa_elems->csa_tpe; } else { /* * If there was no per-STA profile for this link, we * get called with csa_elems == NULL. This of course means * there are no CSA elements, so set res=1 indicating * no more CSA. */ res = 1; } if (res < 0) { /* ignore this case, not a protected frame */ if (source == IEEE80211_CSA_SOURCE_UNPROT_ACTION) return; goto drop_connection; } if (link->conf->csa_active) { switch (source) { case IEEE80211_CSA_SOURCE_PROT_ACTION: case IEEE80211_CSA_SOURCE_UNPROT_ACTION: /* already processing - disregard action frames */ return; case IEEE80211_CSA_SOURCE_BEACON: if (link->u.mgd.csa.waiting_bcn) { ieee80211_chswitch_post_beacon(link); /* * If the CSA is still present after the switch * we need to consider it as a new CSA (possibly * to self). This happens by not returning here * so we'll get to the check below. */ } else if (res) { ieee80211_sta_abort_chanswitch(link); return; } else { drv_channel_switch_rx_beacon(sdata, &ch_switch); return; } break; case IEEE80211_CSA_SOURCE_OTHER_LINK: /* active link: we want to see the beacon to continue */ if (ieee80211_vif_link_active(&sdata->vif, link->link_id)) return; /* switch work ran, so just complete the process */ if (link->u.mgd.csa.waiting_bcn) { ieee80211_chswitch_post_beacon(link); /* * If the CSA is still present after the switch * we need to consider it as a new CSA (possibly * to self). This happens by not returning here * so we'll get to the check below. */ break; } /* link still has CSA but we already know, do nothing */ if (!res) return; /* check in the RNR if the CSA aborted */ ieee80211_sta_other_link_csa_disappeared(link, full_elems); return; } } /* no active CSA nor a new one */ if (res) { /* * However, we may have stopped queues when receiving a public * action frame that couldn't be protected, if it had the quiet * bit set. This is a trade-off, we want to be quiet as soon as * possible, but also don't trust the public action frame much, * as it can't be protected. */ if (unlikely(link->u.mgd.csa.blocked_tx)) { link->u.mgd.csa.blocked_tx = false; ieee80211_vif_unblock_queues_csa(sdata); } return; } /* * We don't really trust public action frames, but block queues (go to * quiet mode) for them anyway, we should get a beacon soon to either * know what the CSA really is, or figure out the public action frame * was actually an attack. */ if (source == IEEE80211_CSA_SOURCE_UNPROT_ACTION) { if (csa_ie.mode) { link->u.mgd.csa.blocked_tx = true; ieee80211_vif_block_queues_csa(sdata); } return; } if (link->conf->chanreq.oper.chan->band != csa_ie.chanreq.oper.chan->band) { link_info(link, "AP %pM switches to different band (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", link->u.mgd.bssid, csa_ie.chanreq.oper.chan->center_freq, csa_ie.chanreq.oper.width, csa_ie.chanreq.oper.center_freq1, csa_ie.chanreq.oper.center_freq2); goto drop_connection; } if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chanreq.oper, IEEE80211_CHAN_DISABLED)) { link_info(link, "AP %pM switches to unsupported channel (%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), disconnecting\n", link->u.mgd.bssid, csa_ie.chanreq.oper.chan->center_freq, csa_ie.chanreq.oper.chan->freq_offset, csa_ie.chanreq.oper.width, csa_ie.chanreq.oper.center_freq1, csa_ie.chanreq.oper.freq1_offset, csa_ie.chanreq.oper.center_freq2); goto drop_connection; } if (cfg80211_chandef_identical(&csa_ie.chanreq.oper, &link->conf->chanreq.oper) && (!csa_ie.mode || source != IEEE80211_CSA_SOURCE_BEACON)) { if (link->u.mgd.csa.ignored_same_chan) return; link_info(link, "AP %pM tries to chanswitch to same channel, ignore\n", link->u.mgd.bssid); link->u.mgd.csa.ignored_same_chan = true; return; } /* * Drop all TDLS peers on the affected link - either we disconnect or * move to a different channel from this point on. There's no telling * what our peer will do. * The TDLS WIDER_BW scenario is also problematic, as peers might now * have an incompatible wider chandef. */ ieee80211_teardown_tdls_peers(link); conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && !conf) { link_info(link, "no channel context assigned to vif?, disconnecting\n"); goto drop_connection; } if (conf) chanctx = container_of(conf, struct ieee80211_chanctx, conf); if (!ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) { link_info(link, "driver doesn't support chan-switch with channel contexts\n"); goto drop_connection; } if (drv_pre_channel_switch(sdata, &ch_switch)) { link_info(link, "preparing for channel switch failed, disconnecting\n"); goto drop_connection; } link->u.mgd.csa.ap_chandef = csa_ie.chanreq.ap; link->csa.chanreq.oper = csa_ie.chanreq.oper; ieee80211_set_chanreq_ap(sdata, &link->csa.chanreq, &link->u.mgd.conn, &csa_ie.chanreq.ap); if (chanctx) { res = ieee80211_link_reserve_chanctx(link, &link->csa.chanreq, chanctx->mode, false); if (res) { link_info(link, "failed to reserve channel context for channel switch, disconnecting (err=%d)\n", res); goto drop_connection; } } link->conf->csa_active = true; link->u.mgd.csa.ignored_same_chan = false; link->u.mgd.beacon_crc_valid = false; link->u.mgd.csa.blocked_tx = csa_ie.mode; if (csa_ie.mode) ieee80211_vif_block_queues_csa(sdata); cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chanreq.oper, link->link_id, csa_ie.count, csa_ie.mode); /* we may have to handle timeout for deactivated link in software */ now = ktime_get_boottime(); csa_time_tu = (max_t(int, csa_ie.count, 1) - 1) * link->conf->beacon_int; link->u.mgd.csa.time = now + us_to_ktime(ieee80211_tu_to_usec(csa_time_tu)); if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && local->ops->channel_switch) { /* * Use driver's channel switch callback, the driver will * later call ieee80211_chswitch_done(). It may deactivate * the link as well, we handle that elsewhere and queue * the csa.switch_work for the calculated time then. */ drv_channel_switch(local, sdata, &ch_switch); return; } /* channel switch handled in software */ wiphy_hrtimer_work_queue(local->hw.wiphy, &link->u.mgd.csa.switch_work, link->u.mgd.csa.time - now); return; drop_connection: /* * This is just so that the disconnect flow will know that * we were trying to switch channel and failed. In case the * mode is 1 (we are not allowed to Tx), we will know not to * send a deauthentication frame. Those two fields will be * reset when the disconnection worker runs. */ link->conf->csa_active = true; link->u.mgd.csa.blocked_tx = csa_ie.mode; wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); } struct sta_bss_param_ch_cnt_data { struct ieee80211_sub_if_data *sdata; u8 reporting_link_id; u8 mld_id; }; static enum cfg80211_rnr_iter_ret ieee80211_sta_bss_param_ch_cnt_iter(void *_data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len) { struct sta_bss_param_ch_cnt_data *data = _data; struct ieee80211_sub_if_data *sdata = data->sdata; const struct ieee80211_tbtt_info_ge_11 *ti; u8 bss_param_ch_cnt; int link_id; if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) return RNR_ITER_CONTINUE; if (tbtt_info_len < sizeof(*ti)) return RNR_ITER_CONTINUE; ti = (const void *)tbtt_info; if (ti->mld_params.mld_id != data->mld_id) return RNR_ITER_CONTINUE; link_id = le16_get_bits(ti->mld_params.params, IEEE80211_RNR_MLD_PARAMS_LINK_ID); bss_param_ch_cnt = le16_get_bits(ti->mld_params.params, IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); if (bss_param_ch_cnt != 255 && link_id < ARRAY_SIZE(sdata->link)) { struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); if (link && link->conf->bss_param_ch_cnt != bss_param_ch_cnt) { link->conf->bss_param_ch_cnt = bss_param_ch_cnt; link->conf->bss_param_ch_cnt_link_id = data->reporting_link_id; } } return RNR_ITER_CONTINUE; } static void ieee80211_mgd_update_bss_param_ch_cnt(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf, struct ieee802_11_elems *elems) { struct sta_bss_param_ch_cnt_data data = { .reporting_link_id = bss_conf->link_id, .sdata = sdata, }; int bss_param_ch_cnt; if (!elems->ml_basic) return; data.mld_id = ieee80211_mle_get_mld_id((const void *)elems->ml_basic); cfg80211_iter_rnr(elems->ie_start, elems->total_len, ieee80211_sta_bss_param_ch_cnt_iter, &data); bss_param_ch_cnt = ieee80211_mle_get_bss_param_ch_cnt((const void *)elems->ml_basic); /* * Update bss_param_ch_cnt_link_id even if bss_param_ch_cnt * didn't change to indicate that we got a beacon on our own * link. */ if (bss_param_ch_cnt >= 0 && bss_param_ch_cnt != 255) { bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; bss_conf->bss_param_ch_cnt_link_id = bss_conf->link_id; } } static bool ieee80211_find_80211h_pwr_constr(struct ieee80211_channel *channel, const u8 *country_ie, u8 country_ie_len, const u8 *pwr_constr_elem, int *chan_pwr, int *pwr_reduction) { struct ieee80211_country_ie_triplet *triplet; int chan = ieee80211_frequency_to_channel(channel->center_freq); int i, chan_increment; bool have_chan_pwr = false; /* Invalid IE */ if (country_ie_len % 2 || country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) return false; triplet = (void *)(country_ie + 3); country_ie_len -= 3; switch (channel->band) { default: WARN_ON_ONCE(1); fallthrough; case NL80211_BAND_2GHZ: case NL80211_BAND_60GHZ: case NL80211_BAND_LC: chan_increment = 1; break; case NL80211_BAND_5GHZ: chan_increment = 4; break; case NL80211_BAND_6GHZ: /* * In the 6 GHz band, the "maximum transmit power level" * field in the triplets is reserved, and thus will be * zero and we shouldn't use it to control TX power. * The actual TX power will be given in the transmit * power envelope element instead. */ return false; } /* find channel */ while (country_ie_len >= 3) { u8 first_channel = triplet->chans.first_channel; if (first_channel >= IEEE80211_COUNTRY_EXTENSION_ID) goto next; for (i = 0; i < triplet->chans.num_channels; i++) { if (first_channel + i * chan_increment == chan) { have_chan_pwr = true; *chan_pwr = triplet->chans.max_power; break; } } if (have_chan_pwr) break; next: triplet++; country_ie_len -= 3; } if (have_chan_pwr && pwr_constr_elem) *pwr_reduction = *pwr_constr_elem; else *pwr_reduction = 0; return have_chan_pwr; } static void ieee80211_find_cisco_dtpc(struct ieee80211_channel *channel, const u8 *cisco_dtpc_ie, int *pwr_level) { /* From practical testing, the first data byte of the DTPC element * seems to contain the requested dBm level, and the CLI on Cisco * APs clearly state the range is -127 to 127 dBm, which indicates * a signed byte, although it seemingly never actually goes negative. * The other byte seems to always be zero. */ *pwr_level = (__s8)cisco_dtpc_ie[4]; } static u64 ieee80211_handle_pwr_constr(struct ieee80211_link_data *link, struct ieee80211_channel *channel, struct ieee80211_mgmt *mgmt, const u8 *country_ie, u8 country_ie_len, const u8 *pwr_constr_ie, const u8 *cisco_dtpc_ie) { struct ieee80211_sub_if_data *sdata = link->sdata; bool has_80211h_pwr = false, has_cisco_pwr = false; int chan_pwr = 0, pwr_reduction_80211h = 0; int pwr_level_cisco, pwr_level_80211h; int new_ap_level; __le16 capab = mgmt->u.probe_resp.capab_info; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) return 0; /* TODO */ if (country_ie && (capab & cpu_to_le16(WLAN_CAPABILITY_SPECTRUM_MGMT) || capab & cpu_to_le16(WLAN_CAPABILITY_RADIO_MEASURE))) { has_80211h_pwr = ieee80211_find_80211h_pwr_constr( channel, country_ie, country_ie_len, pwr_constr_ie, &chan_pwr, &pwr_reduction_80211h); pwr_level_80211h = max_t(int, 0, chan_pwr - pwr_reduction_80211h); } if (cisco_dtpc_ie) { ieee80211_find_cisco_dtpc( channel, cisco_dtpc_ie, &pwr_level_cisco); has_cisco_pwr = true; } if (!has_80211h_pwr && !has_cisco_pwr) return 0; /* If we have both 802.11h and Cisco DTPC, apply both limits * by picking the smallest of the two power levels advertised. */ if (has_80211h_pwr && (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) { new_ap_level = pwr_level_80211h; if (link->ap_power_level == new_ap_level) return 0; sdata_dbg(sdata, "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", pwr_level_80211h, chan_pwr, pwr_reduction_80211h, link->u.mgd.bssid); } else { /* has_cisco_pwr is always true here. */ new_ap_level = pwr_level_cisco; if (link->ap_power_level == new_ap_level) return 0; sdata_dbg(sdata, "Limiting TX power to %d dBm as advertised by %pM\n", pwr_level_cisco, link->u.mgd.bssid); } link->ap_power_level = new_ap_level; if (__ieee80211_recalc_txpower(link)) return BSS_CHANGED_TXPOWER; return 0; } /* powersave */ static void ieee80211_enable_ps(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_conf *conf = &local->hw.conf; /* * If we are scanning right now then the parameters will * take effect when scan finishes. */ if (local->scanning) return; if (conf->dynamic_ps_timeout > 0 && !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(conf->dynamic_ps_timeout)); } else { if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) ieee80211_send_nullfunc(local, sdata, true); if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) return; conf->flags |= IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } static void ieee80211_change_ps(struct ieee80211_local *local) { struct ieee80211_conf *conf = &local->hw.conf; if (local->ps_sdata) { ieee80211_enable_ps(local, local->ps_sdata); } else if (conf->flags & IEEE80211_CONF_PS) { conf->flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); } } static bool ieee80211_powersave_allowed(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *mgd = &sdata->u.mgd; struct sta_info *sta = NULL; bool authorized = false; if (!mgd->powersave) return false; if (mgd->broken_ap) return false; if (!mgd->associated) return false; if (mgd->flags & IEEE80211_STA_CONNECTION_POLL) return false; if (!(local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) && !sdata->deflink.u.mgd.have_beacon) return false; rcu_read_lock(); sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (sta) authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); rcu_read_unlock(); return authorized; } /* need to hold RTNL or interface lock */ void ieee80211_recalc_ps(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata, *found = NULL; int count = 0; int timeout; if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS) || ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) { local->ps_sdata = NULL; return; } list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_AP) { /* If an AP vif is found, then disable PS * by setting the count to zero thereby setting * ps_sdata to NULL. */ count = 0; break; } if (sdata->vif.type != NL80211_IFTYPE_STATION) continue; found = sdata; count++; } if (count == 1 && ieee80211_powersave_allowed(found)) { u8 dtimper = found->deflink.u.mgd.dtim_period; timeout = local->dynamic_ps_forced_timeout; if (timeout < 0) timeout = 100; local->hw.conf.dynamic_ps_timeout = timeout; /* If the TIM IE is invalid, pretend the value is 1 */ if (!dtimper) dtimper = 1; local->hw.conf.ps_dtim_period = dtimper; local->ps_sdata = found; } else { local->ps_sdata = NULL; } ieee80211_change_ps(local); } void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata) { bool ps_allowed = ieee80211_powersave_allowed(sdata); if (sdata->vif.cfg.ps != ps_allowed) { sdata->vif.cfg.ps = ps_allowed; ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_PS); } } void ieee80211_dynamic_ps_disable_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, dynamic_ps_disable_work); if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_PS, false); } void ieee80211_dynamic_ps_enable_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, dynamic_ps_enable_work); struct ieee80211_sub_if_data *sdata = local->ps_sdata; struct ieee80211_if_managed *ifmgd; unsigned long flags; int q; /* can only happen when PS was just disabled anyway */ if (!sdata) return; ifmgd = &sdata->u.mgd; if (local->hw.conf.flags & IEEE80211_CONF_PS) return; if (local->hw.conf.dynamic_ps_timeout > 0) { /* don't enter PS if TX frames are pending */ if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); return; } /* * transmission can be stopped by others which leads to * dynamic_ps_timer expiry. Postpone the ps timer if it * is not the actual idle state. */ spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (q = 0; q < local->hw.queues; q++) { if (local->queue_stop_reasons[q]) { spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); return; } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && !(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); } else { ieee80211_send_nullfunc(local, sdata, true); /* Flush to get the tx status of nullfunc frame */ ieee80211_flush_queues(local, sdata, false); } } if (!(ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) || (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; local->hw.conf.flags |= IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } void ieee80211_dynamic_ps_timer(struct timer_list *t) { struct ieee80211_local *local = timer_container_of(local, t, dynamic_ps_timer); wiphy_work_queue(local->hw.wiphy, &local->dynamic_ps_enable_work); } void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, dfs_cac_timer_work.work); struct cfg80211_chan_def chandef = link->conf->chanreq.oper; struct ieee80211_sub_if_data *sdata = link->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (sdata->wdev.links[link->link_id].cac_started) { ieee80211_link_release_channel(link); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_FINISHED, GFP_KERNEL, link->link_id); } } static bool __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool ret = false; int ac; if (local->hw.queues < IEEE80211_NUM_ACS) return false; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac]; int non_acm_ac; unsigned long now = jiffies; if (tx_tspec->action == TX_TSPEC_ACTION_NONE && tx_tspec->admitted_time && time_after(now, tx_tspec->time_slice_start + HZ)) { tx_tspec->consumed_tx_time = 0; tx_tspec->time_slice_start = now; if (tx_tspec->downgraded) tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE; } switch (tx_tspec->action) { case TX_TSPEC_ACTION_STOP_DOWNGRADE: /* take the original parameters */ if (drv_conf_tx(local, &sdata->deflink, ac, &sdata->deflink.tx_conf[ac])) link_err(&sdata->deflink, "failed to set TX queue parameters for queue %d\n", ac); tx_tspec->action = TX_TSPEC_ACTION_NONE; tx_tspec->downgraded = false; ret = true; break; case TX_TSPEC_ACTION_DOWNGRADE: if (time_after(now, tx_tspec->time_slice_start + HZ)) { tx_tspec->action = TX_TSPEC_ACTION_NONE; ret = true; break; } /* downgrade next lower non-ACM AC */ for (non_acm_ac = ac + 1; non_acm_ac < IEEE80211_NUM_ACS; non_acm_ac++) if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac))) break; /* Usually the loop will result in using BK even if it * requires admission control, but such a configuration * makes no sense and we have to transmit somehow - the * AC selection does the same thing. * If we started out trying to downgrade from BK, then * the extra condition here might be needed. */ if (non_acm_ac >= IEEE80211_NUM_ACS) non_acm_ac = IEEE80211_AC_BK; if (drv_conf_tx(local, &sdata->deflink, ac, &sdata->deflink.tx_conf[non_acm_ac])) link_err(&sdata->deflink, "failed to set TX queue parameters for queue %d\n", ac); tx_tspec->action = TX_TSPEC_ACTION_NONE; ret = true; wiphy_delayed_work_queue(local->hw.wiphy, &ifmgd->tx_tspec_wk, tx_tspec->time_slice_start + HZ - now + 1); break; case TX_TSPEC_ACTION_NONE: /* nothing now */ break; } } return ret; } void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata) { if (__ieee80211_sta_handle_tspec_ac_params(sdata)) ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_QOS); } static void ieee80211_sta_handle_tspec_ac_params_wk(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata; sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.tx_tspec_wk.work); ieee80211_sta_handle_tspec_ac_params(sdata); } void ieee80211_mgd_set_link_qos_params(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_tx_queue_params *params = link->tx_conf; u8 ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { mlme_dbg(sdata, "WMM AC=%d acm=%d aifs=%d cWmin=%d cWmax=%d txop=%d uapsd=%d, downgraded=%d\n", ac, params[ac].acm, params[ac].aifs, params[ac].cw_min, params[ac].cw_max, params[ac].txop, params[ac].uapsd, ifmgd->tx_tspec[ac].downgraded); if (!ifmgd->tx_tspec[ac].downgraded && drv_conf_tx(local, link, ac, &params[ac])) link_err(link, "failed to set TX queue parameters for AC %d\n", ac); } } /* MLME */ static bool _ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_link_data *link, const u8 *wmm_param, size_t wmm_param_len, const struct ieee80211_mu_edca_param_set *mu_edca) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; size_t left; int count, mu_edca_count, ac; const u8 *pos; u8 uapsd_queues = 0; if (!local->ops->conf_tx) return false; if (local->hw.queues < IEEE80211_NUM_ACS) return false; if (!wmm_param) return false; if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1) return false; if (ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) uapsd_queues = ifmgd->uapsd_queues; count = wmm_param[6] & 0x0f; /* -1 is the initial value of ifmgd->mu_edca_last_param_set. * if mu_edca was preset before and now it disappeared tell * the driver about it. */ mu_edca_count = mu_edca ? mu_edca->mu_qos_info & 0x0f : -1; if (count == link->u.mgd.wmm_last_param_set && mu_edca_count == link->u.mgd.mu_edca_last_param_set) return false; link->u.mgd.wmm_last_param_set = count; link->u.mgd.mu_edca_last_param_set = mu_edca_count; pos = wmm_param + 8; left = wmm_param_len - 8; memset(&params, 0, sizeof(params)); sdata->wmm_acm = 0; for (; left >= 4; left -= 4, pos += 4) { int aci = (pos[0] >> 5) & 0x03; int acm = (pos[0] >> 4) & 0x01; bool uapsd = false; switch (aci) { case 1: /* AC_BK */ ac = IEEE80211_AC_BK; if (acm) sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_bk; break; case 2: /* AC_VI */ ac = IEEE80211_AC_VI; if (acm) sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_vi; break; case 3: /* AC_VO */ ac = IEEE80211_AC_VO; if (acm) sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_vo; break; case 0: /* AC_BE */ default: ac = IEEE80211_AC_BE; if (acm) sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_be; break; } params[ac].aifs = pos[0] & 0x0f; if (params[ac].aifs < 2) { link_info(link, "AP has invalid WMM params (AIFSN=%d for ACI %d), will use 2\n", params[ac].aifs, aci); params[ac].aifs = 2; } params[ac].cw_max = ecw2cw((pos[1] & 0xf0) >> 4); params[ac].cw_min = ecw2cw(pos[1] & 0x0f); params[ac].txop = get_unaligned_le16(pos + 2); params[ac].acm = acm; params[ac].uapsd = uapsd; if (params[ac].cw_min == 0 || params[ac].cw_min > params[ac].cw_max) { link_info(link, "AP has invalid WMM params (CWmin/max=%d/%d for ACI %d), using defaults\n", params[ac].cw_min, params[ac].cw_max, aci); return false; } ieee80211_regulatory_limit_wmm_params(sdata, &params[ac], ac); } /* WMM specification requires all 4 ACIs. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { if (params[ac].cw_min == 0) { link_info(link, "AP has invalid WMM params (missing AC %d), using defaults\n", ac); return false; } } for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) link->tx_conf[ac] = params[ac]; return true; } static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_link_data *link, const u8 *wmm_param, size_t wmm_param_len, const struct ieee80211_mu_edca_param_set *mu_edca) { if (!_ieee80211_sta_wmm_params(local, link, wmm_param, wmm_param_len, mu_edca)) return false; ieee80211_mgd_set_link_qos_params(link); /* enable WMM or activate new settings */ link->conf->qos = true; return true; } static void __ieee80211_stop_poll(struct ieee80211_sub_if_data *sdata) { lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata->u.mgd.flags &= ~IEEE80211_STA_CONNECTION_POLL; ieee80211_run_deferred_scan(sdata->local); } static void ieee80211_stop_poll(struct ieee80211_sub_if_data *sdata) { lockdep_assert_wiphy(sdata->local->hw.wiphy); __ieee80211_stop_poll(sdata); } static u64 ieee80211_handle_bss_capability(struct ieee80211_link_data *link, u16 capab, bool erp_valid, u8 erp) { struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_supported_band *sband; u64 changed = 0; bool use_protection; bool use_short_preamble; bool use_short_slot; sband = ieee80211_get_link_sband(link); if (!sband) return changed; if (erp_valid) { use_protection = (erp & WLAN_ERP_USE_PROTECTION) != 0; use_short_preamble = (erp & WLAN_ERP_BARKER_PREAMBLE) == 0; } else { use_protection = false; use_short_preamble = !!(capab & WLAN_CAPABILITY_SHORT_PREAMBLE); } use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); if (sband->band == NL80211_BAND_5GHZ || sband->band == NL80211_BAND_6GHZ) use_short_slot = true; if (use_protection != bss_conf->use_cts_prot) { bss_conf->use_cts_prot = use_protection; changed |= BSS_CHANGED_ERP_CTS_PROT; } if (use_short_preamble != bss_conf->use_short_preamble) { bss_conf->use_short_preamble = use_short_preamble; changed |= BSS_CHANGED_ERP_PREAMBLE; } if (use_short_slot != bss_conf->use_short_slot) { bss_conf->use_short_slot = use_short_slot; changed |= BSS_CHANGED_ERP_SLOT; } return changed; } static u64 ieee80211_link_set_associated(struct ieee80211_link_data *link, struct cfg80211_bss *cbss) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_bss *bss = (void *)cbss->priv; u64 changed = BSS_CHANGED_QOS; /* not really used in MLO */ sdata->u.mgd.beacon_timeout = usecs_to_jiffies(ieee80211_tu_to_usec(beacon_loss_count * bss_conf->beacon_int)); changed |= ieee80211_handle_bss_capability(link, bss_conf->assoc_capability, bss->has_erp_value, bss->erp_value); ieee80211_check_rate_mask(link); link->conf->bss = cbss; memcpy(link->u.mgd.bssid, cbss->bssid, ETH_ALEN); if (sdata->vif.p2p || sdata->vif.driver_flags & IEEE80211_VIF_GET_NOA_UPDATE) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); ies = rcu_dereference(cbss->ies); if (ies) { int ret; ret = cfg80211_get_p2p_attr( ies->data, ies->len, IEEE80211_P2P_ATTR_ABSENCE_NOTICE, (u8 *) &bss_conf->p2p_noa_attr, sizeof(bss_conf->p2p_noa_attr)); if (ret >= 2) { link->u.mgd.p2p_noa_index = bss_conf->p2p_noa_attr.index; changed |= BSS_CHANGED_P2P_PS; } } rcu_read_unlock(); } if (link->u.mgd.have_beacon) { bss_conf->beacon_rate = bss->beacon_rate; changed |= BSS_CHANGED_BEACON_INFO; } else { bss_conf->beacon_rate = NULL; } /* Tell the driver to monitor connection quality (if supported) */ if (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI && bss_conf->cqm_rssi_thold) changed |= BSS_CHANGED_CQM; return changed; } static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data, u64 changed[IEEE80211_MLD_MAX_NUM_LINKS]) { struct ieee80211_local *local = sdata->local; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; u64 vif_changed = BSS_CHANGED_ASSOC; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); sdata->u.mgd.associated = true; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_link_data *link; if (!cbss || assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) continue; if (ieee80211_vif_is_mld(&sdata->vif) && !(ieee80211_vif_usable_links(&sdata->vif) & BIT(link_id))) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) return; changed[link_id] |= ieee80211_link_set_associated(link, cbss); } /* just to be sure */ ieee80211_stop_poll(sdata); ieee80211_led_assoc(local, 1); vif_cfg->assoc = 1; /* Enable ARP filtering */ if (vif_cfg->arp_addr_cnt) vif_changed |= BSS_CHANGED_ARP_FILTER; if (ieee80211_vif_is_mld(&sdata->vif)) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link; struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; if (!cbss || !(BIT(link_id) & ieee80211_vif_usable_links(&sdata->vif)) || assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) return; ieee80211_link_info_change_notify(sdata, link, changed[link_id]); ieee80211_recalc_smps(sdata, link); } ieee80211_vif_cfg_change_notify(sdata, vif_changed); } else { ieee80211_bss_info_change_notify(sdata, vif_changed | changed[0]); } ieee80211_recalc_ps(local); /* leave this here to not change ordering in non-MLO cases */ if (!ieee80211_vif_is_mld(&sdata->vif)) ieee80211_recalc_smps(sdata, &sdata->deflink); ieee80211_recalc_ps_vif(sdata); netif_carrier_on(sdata->dev); } static void ieee80211_ml_reconf_reset(struct ieee80211_sub_if_data *sdata) { struct ieee80211_mgd_assoc_data *add_links_data = sdata->u.mgd.reconf.add_links_data; if (!ieee80211_vif_is_mld(&sdata->vif) || !(sdata->u.mgd.reconf.added_links | sdata->u.mgd.reconf.removed_links)) return; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.reconf.wk); sdata->u.mgd.reconf.added_links = 0; sdata->u.mgd.reconf.removed_links = 0; sdata->u.mgd.reconf.dialog_token = 0; if (add_links_data) { struct cfg80211_mlo_reconf_done_data done_data = {}; u8 link_id; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) done_data.links[link_id].bss = add_links_data->link[link_id].bss; cfg80211_mlo_reconf_add_done(sdata->dev, &done_data); kfree(sdata->u.mgd.reconf.add_links_data); sdata->u.mgd.reconf.add_links_data = NULL; } } static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, u16 stype, u16 reason, bool tx, u8 *frame_buf) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sta_info *ap_sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); unsigned int link_id; u64 changed = 0; struct ieee80211_prep_tx_info info = { .subtype = stype, .was_assoc = true, .link_id = ffs(sdata->vif.active_links) - 1, }; lockdep_assert_wiphy(local->hw.wiphy); if (frame_buf) memset(frame_buf, 0, IEEE80211_DEAUTH_FRAME_LEN); if (WARN_ON(!ap_sta)) return; if (WARN_ON_ONCE(tx && !frame_buf)) return; if (WARN_ON(!ifmgd->associated)) return; ieee80211_stop_poll(sdata); ifmgd->associated = false; if (tx) { bool tx_link_found = false; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; if (!ieee80211_vif_link_active(&sdata->vif, link_id)) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON_ONCE(!link)) continue; if (link->u.mgd.csa.blocked_tx) continue; tx_link_found = true; break; } tx = tx_link_found; } /* other links will be destroyed */ sdata->deflink.conf->bss = NULL; sdata->deflink.conf->epcs_support = false; sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; netif_carrier_off(sdata->dev); /* * if we want to get out of ps before disassoc (why?) we have * to do it before sending disassoc, as otherwise the null-packet * won't be valid. */ if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } local->ps_sdata = NULL; /* disable per-vif ps */ ieee80211_recalc_ps_vif(sdata); /* make sure ongoing transmission finishes */ synchronize_net(); /* * drop any frame before deauth/disassoc, this can be data or * management frame. Since we are disconnecting, we should not * insist sending these frames which can take time and delay * the disconnection and possible the roaming. */ ieee80211_flush_queues(local, sdata, true); if (tx) { drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, sdata->vif.cfg.ap_addr, sdata->vif.cfg.ap_addr, stype, reason, true, frame_buf); /* flush out frame - make sure the deauth was actually sent */ ieee80211_flush_queues(local, sdata, false); drv_mgd_complete_tx(sdata->local, sdata, &info); } else if (frame_buf) { ieee80211_send_deauth_disassoc(sdata, sdata->vif.cfg.ap_addr, sdata->vif.cfg.ap_addr, stype, reason, false, frame_buf); } /* clear AP addr only after building the needed mgmt frames */ eth_zero_addr(sdata->deflink.u.mgd.bssid); eth_zero_addr(sdata->vif.cfg.ap_addr); sdata->vif.cfg.ssid_len = 0; /* Remove TDLS peers */ __sta_info_flush(sdata, false, -1, ap_sta); if (sdata->vif.driver_flags & IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC) { /* Only move the AP state */ sta_info_move_state(ap_sta, IEEE80211_STA_NONE); } else { /* Remove AP peer */ sta_info_flush(sdata, -1); } /* finally reset all BSS / config parameters */ if (!ieee80211_vif_is_mld(&sdata->vif)) changed |= ieee80211_reset_erp_info(sdata); ieee80211_led_assoc(local, 0); changed |= BSS_CHANGED_ASSOC; sdata->vif.cfg.assoc = false; sdata->deflink.u.mgd.p2p_noa_index = -1; memset(&sdata->vif.bss_conf.p2p_noa_attr, 0, sizeof(sdata->vif.bss_conf.p2p_noa_attr)); /* on the next assoc, re-program HT/VHT parameters */ memset(&ifmgd->ht_capa, 0, sizeof(ifmgd->ht_capa)); memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask)); memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa)); memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask)); /* * reset MU-MIMO ownership and group data in default link, * if used, other links are destroyed */ memset(sdata->vif.bss_conf.mu_group.membership, 0, sizeof(sdata->vif.bss_conf.mu_group.membership)); memset(sdata->vif.bss_conf.mu_group.position, 0, sizeof(sdata->vif.bss_conf.mu_group.position)); if (!ieee80211_vif_is_mld(&sdata->vif)) changed |= BSS_CHANGED_MU_GROUPS; sdata->vif.bss_conf.mu_mimo_owner = false; sdata->deflink.ap_power_level = IEEE80211_UNSET_POWER_LEVEL; timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); /* Disable ARP filtering */ if (sdata->vif.cfg.arp_addr_cnt) changed |= BSS_CHANGED_ARP_FILTER; sdata->vif.bss_conf.qos = false; if (!ieee80211_vif_is_mld(&sdata->vif)) { changed |= BSS_CHANGED_QOS; /* The BSSID (not really interesting) and HT changed */ changed |= BSS_CHANGED_BSSID | BSS_CHANGED_HT; ieee80211_bss_info_change_notify(sdata, changed); } else { ieee80211_vif_cfg_change_notify(sdata, changed); } if (sdata->vif.driver_flags & IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC) { /* * After notifying the driver about the disassoc, * remove the ap sta. */ sta_info_flush(sdata, -1); } /* disassociated - set to defaults now */ ieee80211_set_wmm_default(&sdata->deflink, false, false); timer_delete_sync(&sdata->u.mgd.conn_mon_timer); timer_delete_sync(&sdata->u.mgd.bcn_mon_timer); timer_delete_sync(&sdata->u.mgd.timer); sdata->vif.bss_conf.dtim_period = 0; sdata->vif.bss_conf.beacon_rate = NULL; sdata->deflink.u.mgd.have_beacon = false; sdata->deflink.u.mgd.tracking_signal_avg = false; sdata->deflink.u.mgd.disable_wmm_tracking = false; ifmgd->flags = 0; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; ieee80211_link_release_channel(link); } sdata->vif.bss_conf.csa_active = false; sdata->deflink.u.mgd.csa.blocked_tx = false; sdata->deflink.u.mgd.csa.waiting_bcn = false; sdata->deflink.u.mgd.csa.ignored_same_chan = false; ieee80211_vif_unblock_queues_csa(sdata); /* existing TX TSPEC sessions no longer exist */ memset(ifmgd->tx_tspec, 0, sizeof(ifmgd->tx_tspec)); wiphy_delayed_work_cancel(local->hw.wiphy, &ifmgd->tx_tspec_wk); sdata->vif.bss_conf.power_type = IEEE80211_REG_UNSET_AP; sdata->vif.bss_conf.pwr_reduction = 0; ieee80211_clear_tpe(&sdata->vif.bss_conf.tpe); sdata->vif.cfg.eml_cap = 0; sdata->vif.cfg.eml_med_sync_delay = 0; sdata->vif.cfg.mld_capa_op = 0; memset(&sdata->u.mgd.ttlm_info, 0, sizeof(sdata->u.mgd.ttlm_info)); wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->neg_ttlm_timeout_work); sdata->u.mgd.removed_links = 0; wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->teardown_ttlm_work); /* if disconnection happens in the middle of the ML reconfiguration * flow, cfg80211 must called to release the BSS references obtained * when the flow started. */ ieee80211_ml_reconf_reset(sdata); ieee80211_vif_set_links(sdata, 0, 0); ifmgd->mcast_seq_last = IEEE80211_SN_MODULO; ifmgd->epcs.enabled = false; ifmgd->epcs.dialog_token = 0; memset(ifmgd->userspace_selectors, 0, sizeof(ifmgd->userspace_selectors)); } static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (!(ifmgd->flags & IEEE80211_STA_CONNECTION_POLL)) return; __ieee80211_stop_poll(sdata); ieee80211_recalc_ps(local); if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; /* * We've received a probe response, but are not sure whether * we have or will be receiving any beacons or data, so let's * schedule the timers again, just in case. */ ieee80211_sta_reset_beacon_monitor(sdata); mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); } static void ieee80211_sta_tx_wmm_ac_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, u16 tx_time) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 tid; int ac; struct ieee80211_sta_tx_tspec *tx_tspec; unsigned long now = jiffies; if (!ieee80211_is_data_qos(hdr->frame_control)) return; tid = ieee80211_get_tid(hdr); ac = ieee80211_ac_from_tid(tid); tx_tspec = &ifmgd->tx_tspec[ac]; if (likely(!tx_tspec->admitted_time)) return; if (time_after(now, tx_tspec->time_slice_start + HZ)) { tx_tspec->consumed_tx_time = 0; tx_tspec->time_slice_start = now; if (tx_tspec->downgraded) { tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &ifmgd->tx_tspec_wk, 0); } } if (tx_tspec->downgraded) return; tx_tspec->consumed_tx_time += tx_time; if (tx_tspec->consumed_tx_time >= tx_tspec->admitted_time) { tx_tspec->downgraded = true; tx_tspec->action = TX_TSPEC_ACTION_DOWNGRADE; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &ifmgd->tx_tspec_wk, 0); } } void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, bool ack, u16 tx_time) { ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time); if (!ieee80211_is_any_nullfunc(hdr->frame_control) || !sdata->u.mgd.probe_send_count) return; if (ack) sdata->u.mgd.probe_send_count = 0; else sdata->u.mgd.nullfunc_failed = true; wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, struct ieee80211_channel *channel) { struct sk_buff *skb; skb = ieee80211_build_probe_req(sdata, src, dst, (u32)-1, channel, ssid, ssid_len, NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED); if (skb) ieee80211_tx_skb(sdata, skb); } static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 *dst = sdata->vif.cfg.ap_addr; u8 unicast_limit = max(1, max_probe_tries - 3); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * Try sending broadcast probe requests for the last three * probe requests after the first ones failed since some * buggy APs only support broadcast probe requests. */ if (ifmgd->probe_send_count >= unicast_limit) dst = NULL; /* * When the hardware reports an accurate Tx ACK status, it's * better to send a nullfunc frame instead of a probe request, * as it will kick us off the AP quickly if we aren't associated * anymore. The timeout will be reset if the frame is ACKed by * the AP. */ ifmgd->probe_send_count++; if (dst) { sta = sta_info_get(sdata, dst); if (!WARN_ON(!sta)) ieee80211_check_fast_rx(sta); } if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; ieee80211_send_nullfunc(sdata->local, sdata, false); } else { ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst, sdata->vif.cfg.ssid, sdata->vif.cfg.ssid_len, sdata->deflink.conf->bss->channel); } ifmgd->probe_timeout = jiffies + msecs_to_jiffies(probe_wait_ms); run_again(sdata, ifmgd->probe_timeout); } static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata, bool beacon) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool already = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!ieee80211_sdata_running(sdata)) return; if (!ifmgd->associated) return; if (sdata->local->tmp_channel || sdata->local->scanning) return; if (sdata->local->suspending) { /* reschedule after resume */ ieee80211_reset_ap_probe(sdata); return; } if (beacon) { mlme_dbg_ratelimited(sdata, "detected beacon loss from AP (missed %d beacons) - probing\n", beacon_loss_count); ieee80211_cqm_beacon_loss_notify(&sdata->vif, GFP_KERNEL); } /* * The driver/our work has already reported this event or the * connection monitoring has kicked in and we have already sent * a probe request. Or maybe the AP died and the driver keeps * reporting until we disassociate... * * In either case we have to ignore the current call to this * function (except for setting the correct probe reason bit) * because otherwise we would reset the timer every time and * never check whether we received a probe response! */ if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) already = true; ifmgd->flags |= IEEE80211_STA_CONNECTION_POLL; if (already) return; ieee80211_recalc_ps(sdata->local); ifmgd->probe_send_count = 0; ieee80211_mgd_probe_ap_send(sdata); } struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct cfg80211_bss *cbss; struct sk_buff *skb; const struct element *ssid; int ssid_len; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION || ieee80211_vif_is_mld(&sdata->vif))) return NULL; if (ifmgd->associated) cbss = sdata->deflink.conf->bss; else if (ifmgd->auth_data) cbss = ifmgd->auth_data->bss; else if (ifmgd->assoc_data && ifmgd->assoc_data->link[0].bss) cbss = ifmgd->assoc_data->link[0].bss; else return NULL; rcu_read_lock(); ssid = ieee80211_bss_get_elem(cbss, WLAN_EID_SSID); if (WARN_ONCE(!ssid || ssid->datalen > IEEE80211_MAX_SSID_LEN, "invalid SSID element (len=%d)", ssid ? ssid->datalen : -1)) ssid_len = 0; else ssid_len = ssid->datalen; skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid, (u32) -1, cbss->channel, ssid->data, ssid_len, NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED); rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_ap_probereq_get); static void ieee80211_report_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *buf, size_t len, bool tx, u16 reason, bool reconnect) { struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = tx ? DEAUTH_TX_EVENT : DEAUTH_RX_EVENT, .u.mlme.reason = reason, }; if (tx) cfg80211_tx_mlme_mgmt(sdata->dev, buf, len, reconnect); else cfg80211_rx_mlme_mgmt(sdata->dev, buf, len); drv_event_callback(sdata->local, sdata, &event); } static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; lockdep_assert_wiphy(local->hw.wiphy); if (!ifmgd->associated) return; if (!ifmgd->driver_disconnect) { unsigned int link_id; /* * AP is probably out of range (or not reachable for another * reason) so remove the bss structs for that AP. In the case * of multi-link, it's not clear that all of them really are * out of range, but if they weren't the driver likely would * have switched to just have a single link active? */ for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); if (!link || !link->conf->bss) continue; cfg80211_unlink_bss(local->hw.wiphy, link->conf->bss); link->conf->bss = NULL; } } ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, ifmgd->driver_disconnect ? WLAN_REASON_DEAUTH_LEAVING : WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, true, frame_buf); /* the other links will be destroyed */ sdata->vif.bss_conf.csa_active = false; sdata->deflink.u.mgd.csa.waiting_bcn = false; sdata->deflink.u.mgd.csa.blocked_tx = false; ieee80211_vif_unblock_queues_csa(sdata); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, ifmgd->reconnect); ifmgd->reconnect = false; } static void ieee80211_beacon_connection_loss_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.beacon_connection_loss_work); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (ifmgd->connection_loss) { sdata_info(sdata, "Connection to AP %pM lost\n", sdata->vif.cfg.ap_addr); __ieee80211_disconnect(sdata); ifmgd->connection_loss = false; } else if (ifmgd->driver_disconnect) { sdata_info(sdata, "Driver requested disconnection from AP %pM\n", sdata->vif.cfg.ap_addr); __ieee80211_disconnect(sdata); ifmgd->driver_disconnect = false; } else { if (ifmgd->associated) sdata->deflink.u.mgd.beacon_loss_count++; ieee80211_mgd_probe_ap(sdata, true); } } static void ieee80211_csa_connection_drop_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.csa_connection_drop_work); __ieee80211_disconnect(sdata); } void ieee80211_beacon_loss(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_hw *hw = &sdata->local->hw; trace_api_beacon_loss(sdata); sdata->u.mgd.connection_loss = false; wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_beacon_loss); void ieee80211_connection_loss(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata; struct ieee80211_hw *hw; KUNIT_STATIC_STUB_REDIRECT(ieee80211_connection_loss, vif); sdata = vif_to_sdata(vif); hw = &sdata->local->hw; trace_api_connection_loss(sdata); sdata->u.mgd.connection_loss = true; wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_connection_loss); void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_hw *hw = &sdata->local->hw; trace_api_disconnect(sdata, reconnect); if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; sdata->u.mgd.driver_disconnect = true; sdata->u.mgd.reconnect = reconnect; wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_disconnect); static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, bool assoc) { struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data; lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata->u.mgd.auth_data = NULL; if (!assoc) { /* * we are not authenticated yet, the only timer that could be * running is the timeout for the authentication response which * which is not relevant anymore. */ timer_delete_sync(&sdata->u.mgd.timer); sta_info_destroy_addr(sdata, auth_data->ap_addr); /* other links are destroyed */ eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0, 0); } cfg80211_put_bss(sdata->local->hw.wiphy, auth_data->bss); kfree(auth_data); } enum assoc_status { ASSOC_SUCCESS, ASSOC_REJECTED, ASSOC_TIMEOUT, ASSOC_ABANDON, }; static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, enum assoc_status status) { struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata->u.mgd.assoc_data = NULL; if (status != ASSOC_SUCCESS) { /* * we are not associated yet, the only timer that could be * running is the timeout for the association response which * which is not relevant anymore. */ timer_delete_sync(&sdata->u.mgd.timer); sta_info_destroy_addr(sdata, assoc_data->ap_addr); eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; sdata->vif.bss_conf.mu_mimo_owner = false; if (status != ASSOC_REJECTED) { struct cfg80211_assoc_failure data = { .timeout = status == ASSOC_TIMEOUT, }; int i; BUILD_BUG_ON(ARRAY_SIZE(data.bss) != ARRAY_SIZE(assoc_data->link)); for (i = 0; i < ARRAY_SIZE(data.bss); i++) data.bss[i] = assoc_data->link[i].bss; if (ieee80211_vif_is_mld(&sdata->vif)) data.ap_mld_addr = assoc_data->ap_addr; cfg80211_assoc_failure(sdata->dev, &data); } ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0, 0); } kfree(assoc_data); } static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data; const struct element *challenge; u8 *pos; u32 tx_flags = 0; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, .link_id = auth_data->link_id, }; pos = mgmt->u.auth.variable; challenge = cfg80211_find_elem(WLAN_EID_CHALLENGE, pos, len - (pos - (u8 *)mgmt)); if (!challenge) return; auth_data->expected_transaction = 4; drv_mgd_prepare_tx(sdata->local, sdata, &info); if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0, (void *)challenge, challenge->datalen + sizeof(*challenge), auth_data->ap_addr, auth_data->ap_addr, auth_data->key, auth_data->key_len, auth_data->key_idx, tx_flags); } static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; const u8 *ap_addr = ifmgd->auth_data->ap_addr; struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata_info(sdata, "authenticated\n"); ifmgd->auth_data->done = true; ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC; ifmgd->auth_data->timeout_started = true; run_again(sdata, ifmgd->auth_data->timeout); /* move station state to auth */ sta = sta_info_get(sdata, ap_addr); if (!sta) { WARN_ONCE(1, "%s: STA %pM not found", sdata->name, ap_addr); return false; } if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) { sdata_info(sdata, "failed moving %pM to auth\n", ap_addr); return false; } return true; } static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 auth_alg, auth_transaction, status_code; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = AUTH_EVENT, }; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, }; bool sae_need_confirm = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (len < 24 + 6) return; if (!ifmgd->auth_data || ifmgd->auth_data->done) return; if (!ether_addr_equal(ifmgd->auth_data->ap_addr, mgmt->bssid)) return; auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); status_code = le16_to_cpu(mgmt->u.auth.status_code); info.link_id = ifmgd->auth_data->link_id; if (auth_alg != ifmgd->auth_data->algorithm || (auth_alg != WLAN_AUTH_SAE && auth_transaction != ifmgd->auth_data->expected_transaction) || (auth_alg == WLAN_AUTH_SAE && (auth_transaction < ifmgd->auth_data->expected_transaction || auth_transaction > 2))) { sdata_info(sdata, "%pM unexpected authentication state: alg %d (expected %d) transact %d (expected %d)\n", mgmt->sa, auth_alg, ifmgd->auth_data->algorithm, auth_transaction, ifmgd->auth_data->expected_transaction); goto notify_driver; } if (status_code != WLAN_STATUS_SUCCESS) { cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); if (auth_alg == WLAN_AUTH_SAE && (status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED || (auth_transaction == 1 && (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || status_code == WLAN_STATUS_SAE_PK)))) { /* waiting for userspace now */ ifmgd->auth_data->waiting = true; ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_SAE_RETRY; ifmgd->auth_data->timeout_started = true; run_again(sdata, ifmgd->auth_data->timeout); if (auth_transaction == 1) sae_need_confirm = true; goto notify_driver; } sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); ieee80211_destroy_auth_data(sdata, false); event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); goto notify_driver; } switch (ifmgd->auth_data->algorithm) { case WLAN_AUTH_OPEN: case WLAN_AUTH_LEAP: case WLAN_AUTH_FT: case WLAN_AUTH_SAE: case WLAN_AUTH_FILS_SK: case WLAN_AUTH_FILS_SK_PFS: case WLAN_AUTH_FILS_PK: break; case WLAN_AUTH_SHARED_KEY: if (ifmgd->auth_data->expected_transaction != 4) { ieee80211_auth_challenge(sdata, mgmt, len); /* need another frame */ return; } break; default: WARN_ONCE(1, "invalid auth alg %d", ifmgd->auth_data->algorithm); goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; info.success = 1; drv_event_callback(sdata->local, sdata, &event); if (ifmgd->auth_data->algorithm != WLAN_AUTH_SAE || (auth_transaction == 2 && ifmgd->auth_data->expected_transaction == 2)) { if (!ieee80211_mark_sta_auth(sdata)) return; /* ignore frame -- wait for timeout */ } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 1) { sae_need_confirm = true; } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 2) { sdata_info(sdata, "SAE peer confirmed\n"); ifmgd->auth_data->peer_confirmed = true; } cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); notify_driver: if (!sae_need_confirm) drv_mgd_complete_tx(sdata->local, sdata, &info); } #define case_WLAN(type) \ case WLAN_REASON_##type: return #type const char *ieee80211_get_reason_code_string(u16 reason_code) { switch (reason_code) { case_WLAN(UNSPECIFIED); case_WLAN(PREV_AUTH_NOT_VALID); case_WLAN(DEAUTH_LEAVING); case_WLAN(DISASSOC_DUE_TO_INACTIVITY); case_WLAN(DISASSOC_AP_BUSY); case_WLAN(CLASS2_FRAME_FROM_NONAUTH_STA); case_WLAN(CLASS3_FRAME_FROM_NONASSOC_STA); case_WLAN(DISASSOC_STA_HAS_LEFT); case_WLAN(STA_REQ_ASSOC_WITHOUT_AUTH); case_WLAN(DISASSOC_BAD_POWER); case_WLAN(DISASSOC_BAD_SUPP_CHAN); case_WLAN(INVALID_IE); case_WLAN(MIC_FAILURE); case_WLAN(4WAY_HANDSHAKE_TIMEOUT); case_WLAN(GROUP_KEY_HANDSHAKE_TIMEOUT); case_WLAN(IE_DIFFERENT); case_WLAN(INVALID_GROUP_CIPHER); case_WLAN(INVALID_PAIRWISE_CIPHER); case_WLAN(INVALID_AKMP); case_WLAN(UNSUPP_RSN_VERSION); case_WLAN(INVALID_RSN_IE_CAP); case_WLAN(IEEE8021X_FAILED); case_WLAN(CIPHER_SUITE_REJECTED); case_WLAN(DISASSOC_UNSPECIFIED_QOS); case_WLAN(DISASSOC_QAP_NO_BANDWIDTH); case_WLAN(DISASSOC_LOW_ACK); case_WLAN(DISASSOC_QAP_EXCEED_TXOP); case_WLAN(QSTA_LEAVE_QBSS); case_WLAN(QSTA_NOT_USE); case_WLAN(QSTA_REQUIRE_SETUP); case_WLAN(QSTA_TIMEOUT); case_WLAN(QSTA_CIPHER_NOT_SUPP); case_WLAN(MESH_PEER_CANCELED); case_WLAN(MESH_MAX_PEERS); case_WLAN(MESH_CONFIG); case_WLAN(MESH_CLOSE); case_WLAN(MESH_MAX_RETRIES); case_WLAN(MESH_CONFIRM_TIMEOUT); case_WLAN(MESH_INVALID_GTK); case_WLAN(MESH_INCONSISTENT_PARAM); case_WLAN(MESH_INVALID_SECURITY); case_WLAN(MESH_PATH_ERROR); case_WLAN(MESH_PATH_NOFORWARD); case_WLAN(MESH_PATH_DEST_UNREACHABLE); case_WLAN(MAC_EXISTS_IN_MBSS); case_WLAN(MESH_CHAN_REGULATORY); case_WLAN(MESH_CHAN); default: return "<unknown>"; } } static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); lockdep_assert_wiphy(sdata->local->hw.wiphy); if (len < 24 + 2) return; if (!ether_addr_equal(mgmt->bssid, mgmt->sa)) { ieee80211_tdls_handle_disconnect(sdata, mgmt->sa, reason_code); return; } if (ifmgd->associated && ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) { sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n", sdata->vif.cfg.ap_addr, reason_code, ieee80211_get_reason_code_string(reason_code)); ieee80211_set_disassoc(sdata, 0, 0, false, NULL); ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code, false); return; } if (ifmgd->assoc_data && ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->ap_addr)) { sdata_info(sdata, "deauthenticated from %pM while associating (Reason: %u=%s)\n", ifmgd->assoc_data->ap_addr, reason_code, ieee80211_get_reason_code_string(reason_code)); ieee80211_destroy_assoc_data(sdata, ASSOC_ABANDON); cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); return; } } static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 reason_code; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (len < 24 + 2) return; if (!ifmgd->associated || !ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) return; reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); if (!ether_addr_equal(mgmt->bssid, mgmt->sa)) { ieee80211_tdls_handle_disconnect(sdata, mgmt->sa, reason_code); return; } sdata_info(sdata, "disassociated from %pM (Reason: %u=%s)\n", sdata->vif.cfg.ap_addr, reason_code, ieee80211_get_reason_code_string(reason_code)); ieee80211_set_disassoc(sdata, 0, 0, false, NULL); ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code, false); } static bool ieee80211_twt_req_supported(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct link_sta_info *link_sta, const struct ieee802_11_elems *elems) { const struct ieee80211_sta_he_cap *own_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (elems->ext_capab_len < 10) return false; if (!(elems->ext_capab[9] & WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT)) return false; return link_sta->pub->he_cap.he_cap_elem.mac_cap_info[0] & IEEE80211_HE_MAC_CAP0_TWT_RES && own_he_cap && (own_he_cap->he_cap_elem.mac_cap_info[0] & IEEE80211_HE_MAC_CAP0_TWT_REQ); } static u64 ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_link_data *link, struct link_sta_info *link_sta, struct ieee802_11_elems *elems) { bool twt = ieee80211_twt_req_supported(sdata, sband, link_sta, elems); if (link->conf->twt_requester != twt) { link->conf->twt_requester = twt; return BSS_CHANGED_TWT; } return 0; } static bool ieee80211_twt_bcast_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf, struct ieee80211_supported_band *sband, struct link_sta_info *link_sta) { const struct ieee80211_sta_he_cap *own_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); return bss_conf->he_support && (link_sta->pub->he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_BCAST_TWT) && own_he_cap && (own_he_cap->he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_BCAST_TWT); } static void ieee80211_epcs_changed(struct ieee80211_sub_if_data *sdata, bool enabled) { /* in any case this is called, dialog token should be reset */ sdata->u.mgd.epcs.dialog_token = 0; if (sdata->u.mgd.epcs.enabled == enabled) return; sdata->u.mgd.epcs.enabled = enabled; cfg80211_epcs_changed(sdata->dev, enabled); } static void ieee80211_epcs_teardown(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; u8 link_id; if (!sdata->u.mgd.epcs.enabled) return; lockdep_assert_wiphy(local->hw.wiphy); for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee802_11_elems *elems; struct ieee80211_link_data *link; const struct cfg80211_bss_ies *ies; bool ret; rcu_read_lock(); link = sdata_dereference(sdata->link[link_id], sdata); if (!link || !link->conf || !link->conf->bss) { rcu_read_unlock(); continue; } if (link->u.mgd.disable_wmm_tracking) { rcu_read_unlock(); ieee80211_set_wmm_default(link, false, false); continue; } ies = rcu_dereference(link->conf->bss->beacon_ies); if (!ies) { rcu_read_unlock(); ieee80211_set_wmm_default(link, false, false); continue; } elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); if (!elems) { rcu_read_unlock(); ieee80211_set_wmm_default(link, false, false); continue; } ret = _ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set); kfree(elems); rcu_read_unlock(); if (!ret) { ieee80211_set_wmm_default(link, false, false); continue; } ieee80211_mgd_set_link_qos_params(link); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); } } static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, struct link_sta_info *link_sta, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, const u8 *elem_start, unsigned int elem_len, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data ?: sdata->u.mgd.reconf.add_links_data; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_local *local = sdata->local; unsigned int link_id = link->link_id; struct ieee80211_elems_parse_params parse_params = { .mode = link->u.mgd.conn.mode, .start = elem_start, .len = elem_len, .link_id = link_id == assoc_data->assoc_link_id ? -1 : link_id, .from_ap = true, }; bool is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; bool is_s1g = cbss->channel->band == NL80211_BAND_S1GHZ; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_supported_band *sband; struct ieee802_11_elems *elems; const __le16 prof_bss_param_ch_present = cpu_to_le16(IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT); u16 capab_info; bool ret; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return false; if (link_id == assoc_data->assoc_link_id) { capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); /* * we should not get to this flow unless the association was * successful, so set the status directly to success */ assoc_data->link[link_id].status = WLAN_STATUS_SUCCESS; if (elems->ml_basic) { int bss_param_ch_cnt = ieee80211_mle_get_bss_param_ch_cnt((const void *)elems->ml_basic); if (bss_param_ch_cnt < 0) { ret = false; goto out; } bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; bss_conf->bss_param_ch_cnt_link_id = link_id; } } else if (elems->parse_error & IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC || !elems->prof || !(elems->prof->control & prof_bss_param_ch_present)) { ret = false; goto out; } else { const u8 *ptr = elems->prof->variable + elems->prof->sta_info_len - 1; int bss_param_ch_cnt; /* * During parsing, we validated that these fields exist, * otherwise elems->prof would have been set to NULL. */ capab_info = get_unaligned_le16(ptr); assoc_data->link[link_id].status = get_unaligned_le16(ptr + 2); bss_param_ch_cnt = ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(elems->prof); bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; bss_conf->bss_param_ch_cnt_link_id = link_id; if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { link_info(link, "association response status code=%u\n", assoc_data->link[link_id].status); ret = true; goto out; } } if (!is_s1g && !elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); ret = false; goto out; } link->u.mgd.tdls_chan_switch_prohibited = elems->ext_capab && elems->ext_capab_len >= 5 && (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); /* * Some APs are erroneously not including some information in their * (re)association response frames. Try to recover by using the data * from the beacon or probe response. This seems to afflict mobile * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ if (!ieee80211_hw_check(&local->hw, STRICT) && !is_6ghz && ((assoc_data->wmm && !elems->wmm_param) || (link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->ht_cap_elem || !elems->ht_operation)) || (is_5ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT && (!elems->vht_cap_elem || !elems->vht_operation)))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems *bss_elems; rcu_read_lock(); ies = rcu_dereference(cbss->ies); if (ies) bss_ies = kmemdup(ies, sizeof(*ies) + ies->len, GFP_ATOMIC); rcu_read_unlock(); if (!bss_ies) { ret = false; goto out; } parse_params.start = bss_ies->data; parse_params.len = bss_ies->len; parse_params.bss = cbss; parse_params.link_id = -1; bss_elems = ieee802_11_parse_elems_full(&parse_params); if (!bss_elems) { ret = false; goto out; } if (assoc_data->wmm && !elems->wmm_param && bss_elems->wmm_param) { elems->wmm_param = bss_elems->wmm_param; sdata_info(sdata, "AP bug: WMM param missing from AssocResp\n"); } /* * Also check if we requested HT/VHT, otherwise the AP doesn't * have to include the IEs in the (re)association response. */ if (!elems->ht_cap_elem && bss_elems->ht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) { elems->ht_cap_elem = bss_elems->ht_cap_elem; sdata_info(sdata, "AP bug: HT capability missing from AssocResp\n"); } if (!elems->ht_operation && bss_elems->ht_operation && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) { elems->ht_operation = bss_elems->ht_operation; sdata_info(sdata, "AP bug: HT operation missing from AssocResp\n"); } if (is_5ghz) { if (!elems->vht_cap_elem && bss_elems->vht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { elems->vht_cap_elem = bss_elems->vht_cap_elem; sdata_info(sdata, "AP bug: VHT capa missing from AssocResp\n"); } if (!elems->vht_operation && bss_elems->vht_operation && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { elems->vht_operation = bss_elems->vht_operation; sdata_info(sdata, "AP bug: VHT operation missing from AssocResp\n"); } } kfree(bss_elems); } /* * We previously checked these in the beacon/probe response, so * they should be present here. This is just a safety net. * Note that the ieee80211_config_bw() below would also check * for this (and more), but this has better error reporting. */ if (!is_6ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); ret = false; goto out; } if (is_5ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT && (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); ret = false; goto out; } /* check/update if AP changed anything in assoc response vs. scan */ if (ieee80211_config_bw(link, elems, link_id == assoc_data->assoc_link_id, changed, le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE)) { ret = false; goto out; } if (WARN_ON(!link->conf->chanreq.oper.chan)) { ret = false; goto out; } sband = local->hw.wiphy->bands[link->conf->chanreq.oper.chan->band]; /* Set up internal HT/VHT capabilities */ if (elems->ht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, elems->ht_cap_elem, link_sta); if (elems->vht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { const struct ieee80211_vht_cap *bss_vht_cap = NULL; const struct cfg80211_bss_ies *ies; /* * Cisco AP module 9115 with FW 17.3 has a bug and sends a * too large maximum MPDU length in the association response * (indicating 12k) that it cannot actually process ... * Work around that. */ rcu_read_lock(); ies = rcu_dereference(cbss->ies); if (ies) { const struct element *elem; elem = cfg80211_find_elem(WLAN_EID_VHT_CAPABILITY, ies->data, ies->len); if (elem && elem->datalen >= sizeof(*bss_vht_cap)) bss_vht_cap = (const void *)elem->data; } if (ieee80211_hw_check(&local->hw, STRICT) && (!bss_vht_cap || memcmp(bss_vht_cap, elems->vht_cap_elem, sizeof(*bss_vht_cap)))) { rcu_read_unlock(); ret = false; link_info(link, "VHT capabilities mismatch\n"); goto out; } ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems->vht_cap_elem, bss_vht_cap, link_sta); rcu_read_unlock(); } if (elems->he_operation && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE && elems->he_cap) { ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, elems->he_cap_len, elems->he_6ghz_capa, link_sta); bss_conf->he_support = link_sta->pub->he_cap.has_he; if (elems->rsnx && elems->rsnx_len && (elems->rsnx[0] & WLAN_RSNX_CAPA_PROTECTED_TWT) && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_PROTECTED_TWT)) bss_conf->twt_protected = true; else bss_conf->twt_protected = false; *changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); if (elems->eht_operation && elems->eht_cap && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_EHT) { ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband, elems->he_cap, elems->he_cap_len, elems->eht_cap, elems->eht_cap_len, link_sta); bss_conf->eht_support = link_sta->pub->eht_cap.has_eht; bss_conf->epcs_support = bss_conf->eht_support && !!(elems->eht_cap->fixed.mac_cap_info[0] & IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS); /* EPCS might be already enabled but a new added link * does not support EPCS. This should not really happen * in practice. */ if (sdata->u.mgd.epcs.enabled && !bss_conf->epcs_support) ieee80211_epcs_teardown(sdata); } else { bss_conf->eht_support = false; bss_conf->epcs_support = false; } } else { bss_conf->he_support = false; bss_conf->twt_requester = false; bss_conf->twt_protected = false; bss_conf->eht_support = false; bss_conf->epcs_support = false; } if (elems->s1g_oper && link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G && elems->s1g_capab) ieee80211_s1g_cap_to_sta_s1g_cap(sdata, elems->s1g_capab, link_sta); bss_conf->twt_broadcast = ieee80211_twt_bcast_support(sdata, bss_conf, sband, link_sta); if (bss_conf->he_support) { bss_conf->he_bss_color.color = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_MASK); bss_conf->he_bss_color.partial = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR); bss_conf->he_bss_color.enabled = !le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); if (bss_conf->he_bss_color.enabled) *changed |= BSS_CHANGED_HE_BSS_COLOR; bss_conf->htc_trig_based_pkt_ext = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); bss_conf->frame_time_rts_th = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); bss_conf->uora_exists = !!elems->uora_element; if (elems->uora_element) bss_conf->uora_ocw_range = elems->uora_element[0]; ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems->he_operation); ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems->he_spr); /* TODO: OPEN: what happens if BSS color disable is set? */ } if (cbss->transmitted_bss) { bss_conf->nontransmitted = true; ether_addr_copy(bss_conf->transmitter_bssid, cbss->transmitted_bss->bssid); bss_conf->bssid_indicator = cbss->max_bssid_indicator; bss_conf->bssid_index = cbss->bssid_index; } /* * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data * in their association response, so ignore that data for our own * configuration. If it changed since the last beacon, we'll get the * next beacon and update then. */ /* * If an operating mode notification IE is present, override the * NSS calculation (that would be done in rate_control_rate_init()) * and use the # of streams from that element. */ if (elems->opmode_notif && !(*elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { u8 nss; nss = *elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; link_sta->pub->rx_nss = nss; } /* * Always handle WMM once after association regardless * of the first value the AP uses. Setting -1 here has * that effect because the AP values is an unsigned * 4-bit value. */ link->u.mgd.wmm_last_param_set = -1; link->u.mgd.mu_edca_last_param_set = -1; if (link->u.mgd.disable_wmm_tracking) { ieee80211_set_wmm_default(link, false, false); } else if (!ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(link, false, true); /* disable WMM tracking in this case to disable * tracking WMM parameter changes in the beacon if * the parameters weren't actually valid. Doing so * avoids changing parameters very strangely when * the AP is going back and forth between valid and * invalid parameters. */ link->u.mgd.disable_wmm_tracking = true; } if (elems->max_idle_period_ie) { bss_conf->max_idle_period = le16_to_cpu(elems->max_idle_period_ie->max_idle_period); bss_conf->protected_keep_alive = !!(elems->max_idle_period_ie->idle_options & WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE); *changed |= BSS_CHANGED_KEEP_ALIVE; } else { bss_conf->max_idle_period = 0; bss_conf->protected_keep_alive = false; } /* set assoc capability (AID was already set earlier), * ieee80211_set_associated() will tell the driver */ bss_conf->assoc_capability = capab_info; ret = true; out: kfree(elems); kfree(bss_ies); return ret; } static int ieee80211_mgd_setup_link_sta(struct ieee80211_link_data *link, struct sta_info *sta, struct link_sta_info *link_sta, struct cfg80211_bss *cbss) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_bss *bss = (void *)cbss->priv; u32 rates = 0, basic_rates = 0; bool have_higher_than_11mbit = false; int min_rate = INT_MAX, min_rate_index = -1; struct ieee80211_supported_band *sband; memcpy(link_sta->addr, cbss->bssid, ETH_ALEN); memcpy(link_sta->pub->addr, cbss->bssid, ETH_ALEN); /* TODO: S1G Basic Rate Set is expressed elsewhere */ if (cbss->channel->band == NL80211_BAND_S1GHZ) { ieee80211_s1g_sta_rate_init(sta); return 0; } sband = local->hw.wiphy->bands[cbss->channel->band]; ieee80211_get_rates(sband, bss->supp_rates, bss->supp_rates_len, NULL, 0, &rates, &basic_rates, NULL, &have_higher_than_11mbit, &min_rate, &min_rate_index); /* * This used to be a workaround for basic rates missing * in the association response frame. Now that we no * longer use the basic rates from there, it probably * doesn't happen any more, but keep the workaround so * in case some *other* APs are buggy in different ways * we can connect -- with a warning. * Allow this workaround only in case the AP provided at least * one rate. */ if (min_rate_index < 0) { link_info(link, "No legacy rates in association response\n"); return -EINVAL; } else if (!basic_rates) { link_info(link, "No basic rates, using min rate instead\n"); basic_rates = BIT(min_rate_index); } if (rates) link_sta->pub->supp_rates[cbss->channel->band] = rates; else link_info(link, "No rates found, keeping mandatory only\n"); link->conf->basic_rates = basic_rates; /* cf. IEEE 802.11 9.2.12 */ link->operating_11g_mode = sband->band == NL80211_BAND_2GHZ && have_higher_than_11mbit; return 0; } static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link, struct cfg80211_bss *cbss) { struct ieee80211_he_mcs_nss_supp *he_mcs_nss_supp; const struct element *ht_cap_elem, *vht_cap_elem; const struct cfg80211_bss_ies *ies; const struct ieee80211_ht_cap *ht_cap; const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; const struct element *he_cap_elem; u16 mcs_80_map, mcs_160_map; int i, mcs_nss_size; bool support_160; u8 chains = 1; if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_HT) return chains; ht_cap_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_HT_CAPABILITY); if (ht_cap_elem && ht_cap_elem->datalen >= sizeof(*ht_cap)) { ht_cap = (void *)ht_cap_elem->data; chains = ieee80211_mcs_to_chains(&ht_cap->mcs); /* * TODO: use "Tx Maximum Number Spatial Streams Supported" and * "Tx Unequal Modulation Supported" fields. */ } if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_VHT) return chains; vht_cap_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_VHT_CAPABILITY); if (vht_cap_elem && vht_cap_elem->datalen >= sizeof(*vht_cap)) { u8 nss; u16 tx_mcs_map; vht_cap = (void *)vht_cap_elem->data; tx_mcs_map = le16_to_cpu(vht_cap->supp_mcs.tx_mcs_map); for (nss = 8; nss > 0; nss--) { if (((tx_mcs_map >> (2 * (nss - 1))) & 3) != IEEE80211_VHT_MCS_NOT_SUPPORTED) break; } /* TODO: use "Tx Highest Supported Long GI Data Rate" field? */ chains = max(chains, nss); } if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_HE) return chains; ies = rcu_dereference(cbss->ies); he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ies->data, ies->len); if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap) + 1) return chains; /* skip one byte ext_tag_id */ he_cap = (void *)(he_cap_elem->data + 1); mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap); /* invalid HE IE */ if (he_cap_elem->datalen < 1 + mcs_nss_size + sizeof(*he_cap)) return chains; /* mcs_nss is right after he_cap info */ he_mcs_nss_supp = (void *)(he_cap + 1); mcs_80_map = le16_to_cpu(he_mcs_nss_supp->tx_mcs_80); for (i = 7; i >= 0; i--) { u8 mcs_80 = mcs_80_map >> (2 * i) & 3; if (mcs_80 != IEEE80211_VHT_MCS_NOT_SUPPORTED) { chains = max_t(u8, chains, i + 1); break; } } support_160 = he_cap->phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; if (!support_160) return chains; mcs_160_map = le16_to_cpu(he_mcs_nss_supp->tx_mcs_160); for (i = 7; i >= 0; i--) { u8 mcs_160 = mcs_160_map >> (2 * i) & 3; if (mcs_160 != IEEE80211_VHT_MCS_NOT_SUPPORTED) { chains = max_t(u8, chains, i + 1); break; } } return chains; } static void ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct cfg80211_assoc_request *req, bool wmm_used, int link_id, struct ieee80211_conn_settings *conn) { struct ieee80211_sta_ht_cap sta_ht_cap = sband->ht_cap; bool is_5ghz = sband->band == NL80211_BAND_5GHZ; bool is_6ghz = sband->band == NL80211_BAND_6GHZ; const struct ieee80211_sta_he_cap *he_cap; const struct ieee80211_sta_eht_cap *eht_cap; struct ieee80211_sta_vht_cap vht_cap; if (sband->band == NL80211_BAND_S1GHZ) { conn->mode = IEEE80211_CONN_MODE_S1G; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; mlme_dbg(sdata, "operating as S1G STA\n"); return; } conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); if (req && req->flags & ASSOC_REQ_DISABLE_HT) { mlme_link_id_dbg(sdata, link_id, "HT disabled by flag, limiting to legacy\n"); goto out; } if (!wmm_used) { mlme_link_id_dbg(sdata, link_id, "WMM/QoS not supported, limiting to legacy\n"); goto out; } if (req) { unsigned int i; for (i = 0; i < req->crypto.n_ciphers_pairwise; i++) { if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 || req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP || req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) { netdev_info(sdata->dev, "WEP/TKIP use, limiting to legacy\n"); goto out; } } } if (!sta_ht_cap.ht_supported && !is_6ghz) { mlme_link_id_dbg(sdata, link_id, "HT not supported (and not on 6 GHz), limiting to legacy\n"); goto out; } /* HT is fine */ conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_CONN_BW_LIMIT_40 : IEEE80211_CONN_BW_LIMIT_20; memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap)); ieee80211_apply_vhtcap_overrides(sdata, &vht_cap); if (req && req->flags & ASSOC_REQ_DISABLE_VHT) { mlme_link_id_dbg(sdata, link_id, "VHT disabled by flag, limiting to HT\n"); goto out; } if (vht_cap.vht_supported && is_5ghz) { bool have_80mhz = false; unsigned int i; if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20) { mlme_link_id_dbg(sdata, link_id, "no 40 MHz support on 5 GHz, limiting to HT\n"); goto out; } /* Allow VHT if at least one channel on the sband supports 80 MHz */ for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (!have_80mhz) { mlme_link_id_dbg(sdata, link_id, "no 80 MHz channel support on 5 GHz, limiting to HT\n"); goto out; } } else if (is_5ghz) { /* !vht_supported but on 5 GHz */ mlme_link_id_dbg(sdata, link_id, "no VHT support on 5 GHz, limiting to HT\n"); goto out; } /* VHT - if we have - is fine, including 80 MHz, check 160 below again */ if (sband->band != NL80211_BAND_2GHZ) { conn->mode = IEEE80211_CONN_MODE_VHT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_160; } if (is_5ghz && !(vht_cap.cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ | IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ))) { conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; mlme_link_id_dbg(sdata, link_id, "no VHT 160 MHz capability on 5 GHz, limiting to 80 MHz"); } if (req && req->flags & ASSOC_REQ_DISABLE_HE) { mlme_link_id_dbg(sdata, link_id, "HE disabled by flag, limiting to HT/VHT\n"); goto out; } he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) { WARN_ON(is_6ghz); mlme_link_id_dbg(sdata, link_id, "no HE support, limiting to HT/VHT\n"); goto out; } /* so we have HE */ conn->mode = IEEE80211_CONN_MODE_HE; /* check bandwidth */ switch (sband->band) { default: case NL80211_BAND_2GHZ: if (he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) break; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; mlme_link_id_dbg(sdata, link_id, "no 40 MHz HE cap in 2.4 GHz, limiting to 20 MHz\n"); break; case NL80211_BAND_5GHZ: if (!(he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G)) { conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; mlme_link_id_dbg(sdata, link_id, "no 40/80 MHz HE cap in 5 GHz, limiting to 20 MHz\n"); break; } if (!(he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)) { conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_80); mlme_link_id_dbg(sdata, link_id, "no 160 MHz HE cap in 5 GHz, limiting to 80 MHz\n"); } break; case NL80211_BAND_6GHZ: if (he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) break; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_80); mlme_link_id_dbg(sdata, link_id, "no 160 MHz HE cap in 6 GHz, limiting to 80 MHz\n"); break; } if (req && req->flags & ASSOC_REQ_DISABLE_EHT) { mlme_link_id_dbg(sdata, link_id, "EHT disabled by flag, limiting to HE\n"); goto out; } eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); if (!eht_cap) { mlme_link_id_dbg(sdata, link_id, "no EHT support, limiting to HE\n"); goto out; } /* we have EHT */ conn->mode = IEEE80211_CONN_MODE_EHT; /* check bandwidth */ if (is_6ghz && eht_cap->eht_cap_elem.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) conn->bw_limit = IEEE80211_CONN_BW_LIMIT_320; else if (is_6ghz) mlme_link_id_dbg(sdata, link_id, "no EHT 320 MHz cap in 6 GHz, limiting to 160 MHz\n"); out: mlme_link_id_dbg(sdata, link_id, "determined local STA to be %s, BW limited to %d MHz\n", ieee80211_conn_mode_str(conn->mode), 20 * (1 << conn->bw_limit)); } static void ieee80211_determine_our_sta_mode_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct cfg80211_auth_request *req, bool wmm_used, struct ieee80211_conn_settings *conn) { ieee80211_determine_our_sta_mode(sdata, sband, NULL, wmm_used, req->link_id > 0 ? req->link_id : 0, conn); } static void ieee80211_determine_our_sta_mode_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct cfg80211_assoc_request *req, bool wmm_used, int link_id, struct ieee80211_conn_settings *conn) { struct ieee80211_conn_settings tmp; WARN_ON(!req); ieee80211_determine_our_sta_mode(sdata, sband, req, wmm_used, link_id, &tmp); conn->mode = min_t(enum ieee80211_conn_mode, conn->mode, tmp.mode); conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, tmp.bw_limit); } static enum ieee80211_ap_reg_power ieee80211_ap_power_type(u8 control) { switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { case IEEE80211_6GHZ_CTRL_REG_LPI_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: return IEEE80211_REG_LPI_AP; case IEEE80211_6GHZ_CTRL_REG_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: return IEEE80211_REG_SP_AP; case IEEE80211_6GHZ_CTRL_REG_VLP_AP: return IEEE80211_REG_VLP_AP; default: return IEEE80211_REG_UNSET_AP; } } static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, int link_id, struct cfg80211_bss *cbss, bool mlo, struct ieee80211_conn_settings *conn, unsigned long *userspace_selectors) { struct ieee80211_local *local = sdata->local; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_chan_req chanreq = {}; struct cfg80211_chan_def ap_chandef; struct ieee802_11_elems *elems; int ret; lockdep_assert_wiphy(local->hw.wiphy); rcu_read_lock(); elems = ieee80211_determine_chan_mode(sdata, conn, cbss, link_id, &chanreq, &ap_chandef, userspace_selectors); if (IS_ERR(elems)) { rcu_read_unlock(); return PTR_ERR(elems); } if (mlo && !elems->ml_basic) { sdata_info(sdata, "Rejecting MLO as it is not supported by AP\n"); rcu_read_unlock(); kfree(elems); return -EINVAL; } if (link && is_6ghz && conn->mode >= IEEE80211_CONN_MODE_HE) { const struct ieee80211_he_6ghz_oper *he_6ghz_oper; if (elems->pwr_constr_elem) link->conf->pwr_reduction = *elems->pwr_constr_elem; he_6ghz_oper = ieee80211_he_6ghz_oper(elems->he_operation); if (he_6ghz_oper) link->conf->power_type = ieee80211_ap_power_type(he_6ghz_oper->control); else link_info(link, "HE 6 GHz operation missing (on %d MHz), expect issues\n", cbss->channel->center_freq); link->conf->tpe = elems->tpe; ieee80211_rearrange_tpe(&link->conf->tpe, &ap_chandef, &chanreq.oper); } rcu_read_unlock(); /* the element data was RCU protected so no longer valid anyway */ kfree(elems); elems = NULL; if (!link) return 0; rcu_read_lock(); link->needed_rx_chains = min(ieee80211_max_rx_chains(link, cbss), local->rx_chains); rcu_read_unlock(); /* * If this fails (possibly due to channel context sharing * on incompatible channels, e.g. 80+80 and 160 sharing the * same control channel) try to use a smaller bandwidth. */ ret = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); /* don't downgrade for 5 and 10 MHz channels, though. */ if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 || chanreq.oper.width == NL80211_CHAN_WIDTH_10) return ret; while (ret && chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT) { ieee80211_chanreq_downgrade(&chanreq, conn); ret = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); } return ret; } static bool ieee80211_get_dtim(const struct cfg80211_bss_ies *ies, u8 *dtim_count, u8 *dtim_period) { const u8 *tim_ie = cfg80211_find_ie(WLAN_EID_TIM, ies->data, ies->len); const u8 *idx_ie = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, ies->data, ies->len); const struct ieee80211_tim_ie *tim = NULL; const struct ieee80211_bssid_index *idx; bool valid = tim_ie && tim_ie[1] >= 2; if (valid) tim = (void *)(tim_ie + 2); if (dtim_count) *dtim_count = valid ? tim->dtim_count : 0; if (dtim_period) *dtim_period = valid ? tim->dtim_period : 0; /* Check if value is overridden by non-transmitted profile */ if (!idx_ie || idx_ie[1] < 3) return valid; idx = (void *)(idx_ie + 2); if (dtim_count) *dtim_count = idx->dtim_count; if (dtim_period) *dtim_period = idx->dtim_period; return true; } static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, struct ieee802_11_elems *elems, const u8 *elem_start, unsigned int elem_len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; struct ieee80211_local *local = sdata->local; unsigned int link_id; struct sta_info *sta; u64 changed[IEEE80211_MLD_MAX_NUM_LINKS] = {}; u16 valid_links = 0, dormant_links = 0; int err; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * station info was already allocated and inserted before * the association and should be available to us */ sta = sta_info_get(sdata, assoc_data->ap_addr); if (WARN_ON(!sta)) goto out_err; sta->sta.spp_amsdu = assoc_data->spp_amsdu; if (ieee80211_vif_is_mld(&sdata->vif)) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!assoc_data->link[link_id].bss) continue; valid_links |= BIT(link_id); if (assoc_data->link[link_id].disabled) dormant_links |= BIT(link_id); if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_allocate_link(sta, link_id); if (err) goto out_err; } } ieee80211_vif_set_links(sdata, valid_links, dormant_links); } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_link_data *link; struct link_sta_info *link_sta; if (!cbss) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) goto out_err; if (ieee80211_vif_is_mld(&sdata->vif)) link_info(link, "local address %pM, AP link address %pM%s\n", link->conf->addr, assoc_data->link[link_id].bss->bssid, link_id == assoc_data->assoc_link_id ? " (assoc)" : ""); link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); if (WARN_ON(!link_sta)) goto out_err; if (!link->u.mgd.have_beacon) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); ies = rcu_dereference(cbss->beacon_ies); if (ies) link->u.mgd.have_beacon = true; else ies = rcu_dereference(cbss->ies); ieee80211_get_dtim(ies, &link->conf->sync_dtim_count, &link->u.mgd.dtim_period); link->conf->beacon_int = cbss->beacon_interval; rcu_read_unlock(); } link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; if (link_id != assoc_data->assoc_link_id) { link->u.mgd.conn = assoc_data->link[link_id].conn; err = ieee80211_prep_channel(sdata, link, link_id, cbss, true, &link->u.mgd.conn, sdata->u.mgd.userspace_selectors); if (err) { link_info(link, "prep_channel failed\n"); goto out_err; } } err = ieee80211_mgd_setup_link_sta(link, sta, link_sta, assoc_data->link[link_id].bss); if (err) goto out_err; if (!ieee80211_assoc_config_link(link, link_sta, assoc_data->link[link_id].bss, mgmt, elem_start, elem_len, &changed[link_id])) goto out_err; if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { valid_links &= ~BIT(link_id); ieee80211_sta_remove_link(sta, link_id); continue; } if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_activate_link(sta, link_id); if (err) goto out_err; } } /* links might have changed due to rejected ones, set them again */ ieee80211_vif_set_links(sdata, valid_links, dormant_links); rate_control_rate_init_all_links(sta); if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) { set_sta_flag(sta, WLAN_STA_MFP); sta->sta.mfp = true; } else { sta->sta.mfp = false; } ieee80211_sta_set_max_amsdu_subframes(sta, elems->ext_capab, elems->ext_capab_len); sta->sta.wme = (elems->wmm_param || elems->s1g_capab) && local->hw.queues >= IEEE80211_NUM_ACS; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) err = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED); if (err) { sdata_info(sdata, "failed to move station %pM to desired state\n", sta->sta.addr); WARN_ON(__sta_info_destroy(sta)); goto out_err; } if (sdata->wdev.use_4addr) drv_sta_set_4addr(local, sdata, &sta->sta, true); ieee80211_set_associated(sdata, assoc_data, changed); /* * If we're using 4-addr mode, let the AP know that we're * doing so, so that it can create the STA VLAN on its side */ if (ifmgd->use_4addr) ieee80211_send_4addr_nullfunc(local, sdata); /* * Start timer to probe the connection to the AP now. * Also start the timer that will detect beacon loss. */ ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); return true; out_err: eth_zero_addr(sdata->vif.cfg.ap_addr); return false; } static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; u16 capab_info, status_code, aid; struct ieee80211_elems_parse_params parse_params = { .bss = NULL, .link_id = -1, .from_ap = true, }; struct ieee802_11_elems *elems; int ac; const u8 *elem_start; unsigned int elem_len; bool reassoc; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = ASSOC_EVENT, }; struct ieee80211_prep_tx_info info = {}; struct cfg80211_rx_assoc_resp_data resp = { .uapsd_queues = -1, }; u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; u16 max_aid = IEEE80211_MAX_AID; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!assoc_data) return; info.link_id = assoc_data->assoc_link_id; parse_params.mode = assoc_data->link[assoc_data->assoc_link_id].conn.mode; if (!ether_addr_equal(assoc_data->ap_addr, mgmt->bssid) || !ether_addr_equal(assoc_data->ap_addr, mgmt->sa)) return; /* * AssocResp and ReassocResp have identical structure, so process both * of them in this function. */ if (len < 24 + 6) return; reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); if (assoc_data->s1g) { elem_start = mgmt->u.s1g_assoc_resp.variable; max_aid = IEEE80211_MAX_SUPPORTED_S1G_AID; } else { elem_start = mgmt->u.assoc_resp.variable; } /* * Note: this may not be perfect, AP might misbehave - if * anyone needs to rely on perfect complete notification * with the exact right subtype, then we need to track what * we actually transmitted. */ info.subtype = reassoc ? IEEE80211_STYPE_REASSOC_REQ : IEEE80211_STYPE_ASSOC_REQ; if (assoc_data->fils_kek_len && fils_decrypt_assoc_resp(sdata, (u8 *)mgmt, &len, assoc_data) < 0) return; elem_len = len - (elem_start - (u8 *)mgmt); parse_params.start = elem_start; parse_params.len = elem_len; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) goto notify_driver; if (elems->aid_resp) aid = le16_to_cpu(elems->aid_resp->aid); else aid = le16_to_cpu(mgmt->u.assoc_resp.aid); /* * The 5 MSB of the AID field are reserved for a non-S1G STA. For * an S1G STA the 3 MSBs are reserved. * (802.11-2016 9.4.1.8 AID field). */ aid &= assoc_data->s1g ? 0x1fff : 0x7ff; sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", reassoc ? "Rea" : "A", assoc_data->ap_addr, capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14)))); ifmgd->broken_ap = false; if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY && elems->timeout_int && elems->timeout_int->type == WLAN_TIMEOUT_ASSOC_COMEBACK) { u32 tu, ms; cfg80211_assoc_comeback(sdata->dev, assoc_data->ap_addr, le32_to_cpu(elems->timeout_int->value)); tu = le32_to_cpu(elems->timeout_int->value); ms = tu * 1024 / 1000; sdata_info(sdata, "%pM rejected association temporarily; comeback duration %u TU (%u ms)\n", assoc_data->ap_addr, tu, ms); assoc_data->timeout = jiffies + msecs_to_jiffies(ms); assoc_data->timeout_started = true; assoc_data->comeback = true; if (ms > IEEE80211_ASSOC_TIMEOUT) run_again(sdata, assoc_data->timeout); goto notify_driver; } if (status_code != WLAN_STATUS_SUCCESS) { sdata_info(sdata, "%pM denied association (code=%d)\n", assoc_data->ap_addr, status_code); event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { if (aid == 0 || aid > max_aid) { sdata_info(sdata, "invalid AID value %d (out of range), turn off PS\n", aid); aid = 0; ifmgd->broken_ap = true; } if (ieee80211_vif_is_mld(&sdata->vif)) { struct ieee80211_mle_basic_common_info *common; if (!elems->ml_basic) { sdata_info(sdata, "MLO association with %pM but no (basic) multi-link element in response!\n", assoc_data->ap_addr); goto abandon_assoc; } common = (void *)elems->ml_basic->variable; if (memcmp(assoc_data->ap_addr, common->mld_mac_addr, ETH_ALEN)) { sdata_info(sdata, "AP MLD MAC address mismatch: got %pM expected %pM\n", common->mld_mac_addr, assoc_data->ap_addr); goto abandon_assoc; } sdata->vif.cfg.eml_cap = ieee80211_mle_get_eml_cap((const void *)elems->ml_basic); sdata->vif.cfg.eml_med_sync_delay = ieee80211_mle_get_eml_med_sync_delay((const void *)elems->ml_basic); sdata->vif.cfg.mld_capa_op = ieee80211_mle_get_mld_capa_op((const void *)elems->ml_basic); } sdata->vif.cfg.aid = aid; sdata->vif.cfg.s1g = assoc_data->s1g; if (!ieee80211_assoc_success(sdata, mgmt, elems, elem_start, elem_len)) { /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT); goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; drv_event_callback(sdata->local, sdata, &event); sdata_info(sdata, "associated\n"); info.success = 1; } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link; if (!assoc_data->link[link_id].bss) continue; resp.links[link_id].bss = assoc_data->link[link_id].bss; ether_addr_copy(resp.links[link_id].addr, assoc_data->link[link_id].addr); resp.links[link_id].status = assoc_data->link[link_id].status; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; /* get uapsd queues configuration - same for all links */ resp.uapsd_queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) if (link->tx_conf[ac].uapsd) resp.uapsd_queues |= ieee80211_ac_to_qos_mask[ac]; } if (ieee80211_vif_is_mld(&sdata->vif)) { ether_addr_copy(ap_mld_addr, sdata->vif.cfg.ap_addr); resp.ap_mld_addr = ap_mld_addr; } ieee80211_destroy_assoc_data(sdata, status_code == WLAN_STATUS_SUCCESS ? ASSOC_SUCCESS : ASSOC_REJECTED); resp.buf = (u8 *)mgmt; resp.len = len; resp.req_ies = ifmgd->assoc_req_ies; resp.req_ies_len = ifmgd->assoc_req_ies_len; cfg80211_rx_assoc_resp(sdata->dev, &resp); notify_driver: drv_mgd_complete_tx(sdata->local, sdata, &info); kfree(elems); return; abandon_assoc: ieee80211_destroy_assoc_data(sdata, ASSOC_ABANDON); goto notify_driver; } static void ieee80211_rx_bss_info(struct ieee80211_link_data *link, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_bss *bss; struct ieee80211_channel *channel; lockdep_assert_wiphy(sdata->local->hw.wiphy); channel = ieee80211_get_channel_khz(local->hw.wiphy, ieee80211_rx_status_to_khz(rx_status)); if (!channel) return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, channel); if (bss) { link->conf->beacon_rate = bss->beacon_rate; ieee80211_rx_bss_put(local, bss); } } static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_link_data *link, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_if_managed *ifmgd; struct ieee80211_rx_status *rx_status = (void *) skb->cb; struct ieee80211_channel *channel; size_t baselen, len = skb->len; ifmgd = &sdata->u.mgd; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * According to Draft P802.11ax D6.0 clause 26.17.2.3.2: * "If a 6 GHz AP receives a Probe Request frame and responds with * a Probe Response frame [..], the Address 1 field of the Probe * Response frame shall be set to the broadcast address [..]" * So, on 6GHz band we should also accept broadcast responses. */ channel = ieee80211_get_channel(sdata->local->hw.wiphy, rx_status->freq); if (!channel) return; if (!ether_addr_equal(mgmt->da, sdata->vif.addr) && (channel->band != NL80211_BAND_6GHZ || !is_broadcast_ether_addr(mgmt->da))) return; /* ignore ProbeResp to foreign address */ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; if (baselen > len) return; ieee80211_rx_bss_info(link, mgmt, len, rx_status); if (ifmgd->associated && ether_addr_equal(mgmt->bssid, link->u.mgd.bssid)) ieee80211_reset_ap_probe(sdata); } /* * This is the canonical list of information elements we care about, * the filter code also gives us all changes to the Microsoft OUI * (00:50:F2) vendor IE which is used for WMM which we need to track, * as well as the DTPC IE (part of the Cisco OUI) used for signaling * changes to requested client power. * * We implement beacon filtering in software since that means we can * avoid processing the frame here and in cfg80211, and userspace * will not be able to tell whether the hardware supports it or not. * * XXX: This list needs to be dynamic -- userspace needs to be able to * add items it requires. It also needs to be able to tell us to * look out for other vendor IEs. */ static const u64 care_about_ies = (1ULL << WLAN_EID_COUNTRY) | (1ULL << WLAN_EID_ERP_INFO) | (1ULL << WLAN_EID_CHANNEL_SWITCH) | (1ULL << WLAN_EID_PWR_CONSTRAINT) | (1ULL << WLAN_EID_HT_CAPABILITY) | (1ULL << WLAN_EID_HT_OPERATION) | (1ULL << WLAN_EID_EXT_CHANSWITCH_ANN); static void ieee80211_handle_beacon_sig(struct ieee80211_link_data *link, struct ieee80211_if_managed *ifmgd, struct ieee80211_bss_conf *bss_conf, struct ieee80211_local *local, struct ieee80211_rx_status *rx_status) { struct ieee80211_sub_if_data *sdata = link->sdata; /* Track average RSSI from the Beacon frames of the current AP */ if (!link->u.mgd.tracking_signal_avg) { link->u.mgd.tracking_signal_avg = true; ewma_beacon_signal_init(&link->u.mgd.ave_beacon_signal); link->u.mgd.last_cqm_event_signal = 0; link->u.mgd.count_beacon_signal = 1; link->u.mgd.last_ave_beacon_signal = 0; } else { link->u.mgd.count_beacon_signal++; } ewma_beacon_signal_add(&link->u.mgd.ave_beacon_signal, -rx_status->signal); if (ifmgd->rssi_min_thold != ifmgd->rssi_max_thold && link->u.mgd.count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { int sig = -ewma_beacon_signal_read(&link->u.mgd.ave_beacon_signal); int last_sig = link->u.mgd.last_ave_beacon_signal; struct ieee80211_event event = { .type = RSSI_EVENT, }; /* * if signal crosses either of the boundaries, invoke callback * with appropriate parameters */ if (sig > ifmgd->rssi_max_thold && (last_sig <= ifmgd->rssi_min_thold || last_sig == 0)) { link->u.mgd.last_ave_beacon_signal = sig; event.u.rssi.data = RSSI_EVENT_HIGH; drv_event_callback(local, sdata, &event); } else if (sig < ifmgd->rssi_min_thold && (last_sig >= ifmgd->rssi_max_thold || last_sig == 0)) { link->u.mgd.last_ave_beacon_signal = sig; event.u.rssi.data = RSSI_EVENT_LOW; drv_event_callback(local, sdata, &event); } } if (bss_conf->cqm_rssi_thold && link->u.mgd.count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT && !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) { int sig = -ewma_beacon_signal_read(&link->u.mgd.ave_beacon_signal); int last_event = link->u.mgd.last_cqm_event_signal; int thold = bss_conf->cqm_rssi_thold; int hyst = bss_conf->cqm_rssi_hyst; if (sig < thold && (last_event == 0 || sig < last_event - hyst)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, sig, GFP_KERNEL); } else if (sig > thold && (last_event == 0 || sig > last_event + hyst)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, sig, GFP_KERNEL); } } if (bss_conf->cqm_rssi_low && link->u.mgd.count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { int sig = -ewma_beacon_signal_read(&link->u.mgd.ave_beacon_signal); int last_event = link->u.mgd.last_cqm_event_signal; int low = bss_conf->cqm_rssi_low; int high = bss_conf->cqm_rssi_high; if (sig < low && (last_event == 0 || last_event >= low)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, sig, GFP_KERNEL); } else if (sig > high && (last_event == 0 || last_event <= high)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, sig, GFP_KERNEL); } } } static bool ieee80211_rx_our_beacon(const u8 *tx_bssid, struct cfg80211_bss *bss) { if (ether_addr_equal(tx_bssid, bss->bssid)) return true; if (!bss->transmitted_bss) return false; return ether_addr_equal(tx_bssid, bss->transmitted_bss->bssid); } static void ieee80211_ml_reconf_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.ml_reconf_work.work); u16 new_valid_links, new_active_links, new_dormant_links; int ret; if (!sdata->u.mgd.removed_links) return; sdata_info(sdata, "MLO Reconfiguration: work: valid=0x%x, removed=0x%x\n", sdata->vif.valid_links, sdata->u.mgd.removed_links); new_valid_links = sdata->vif.valid_links & ~sdata->u.mgd.removed_links; if (new_valid_links == sdata->vif.valid_links) return; if (!new_valid_links || !(new_valid_links & ~sdata->vif.dormant_links)) { sdata_info(sdata, "No valid links after reconfiguration\n"); ret = -EINVAL; goto out; } new_active_links = sdata->vif.active_links & ~sdata->u.mgd.removed_links; if (new_active_links != sdata->vif.active_links) { if (!new_active_links) new_active_links = BIT(ffs(new_valid_links & ~sdata->vif.dormant_links) - 1); ret = ieee80211_set_active_links(&sdata->vif, new_active_links); if (ret) { sdata_info(sdata, "Failed setting active links\n"); goto out; } } new_dormant_links = sdata->vif.dormant_links & ~sdata->u.mgd.removed_links; ret = ieee80211_vif_set_links(sdata, new_valid_links, new_dormant_links); if (ret) sdata_info(sdata, "Failed setting valid links\n"); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); out: if (!ret) cfg80211_links_removed(sdata->dev, sdata->u.mgd.removed_links); else __ieee80211_disconnect(sdata); sdata->u.mgd.removed_links = 0; } static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems) { const struct element *sub; unsigned long removed_links = 0; u16 link_removal_timeout[IEEE80211_MLD_MAX_NUM_LINKS] = {}; u8 link_id; u32 delay; if (!ieee80211_vif_is_mld(&sdata->vif) || !elems->ml_reconf) return; /* Directly parse the sub elements as the common information doesn't * hold any useful information. */ for_each_mle_subelement(sub, (const u8 *)elems->ml_reconf, elems->ml_reconf_len) { struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; u8 *pos = prof->variable; u16 control; if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) continue; if (!ieee80211_mle_reconf_sta_prof_size_ok(sub->data, sub->datalen)) return; control = le16_to_cpu(prof->control); link_id = control & IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID; removed_links |= BIT(link_id); /* the MAC address should not be included, but handle it */ if (control & IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT) pos += 6; /* According to Draft P802.11be_D3.0, the control should * include the AP Removal Timer present. If the AP Removal Timer * is not present assume immediate removal. */ if (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT) link_removal_timeout[link_id] = get_unaligned_le16(pos); } removed_links &= sdata->vif.valid_links; if (!removed_links) { /* In case the removal was cancelled, abort it */ if (sdata->u.mgd.removed_links) { sdata->u.mgd.removed_links = 0; wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); } return; } delay = 0; for_each_set_bit(link_id, &removed_links, IEEE80211_MLD_MAX_NUM_LINKS) { struct ieee80211_bss_conf *link_conf = sdata_dereference(sdata->vif.link_conf[link_id], sdata); u32 link_delay; if (!link_conf) { removed_links &= ~BIT(link_id); continue; } if (link_removal_timeout[link_id] < 1) link_delay = 0; else link_delay = link_conf->beacon_int * (link_removal_timeout[link_id] - 1); if (!delay) delay = link_delay; else delay = min(delay, link_delay); } sdata->u.mgd.removed_links = removed_links; wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work, us_to_ktime(ieee80211_tu_to_usec(delay))); } static int ieee80211_ttlm_set_links(struct ieee80211_sub_if_data *sdata, u16 active_links, u16 dormant_links, u16 suspended_links) { u64 changed = 0; int ret; if (!active_links) { ret = -EINVAL; goto out; } /* If there is an active negotiated TTLM, it should be discarded by * the new negotiated/advertised TTLM. */ if (sdata->vif.neg_ttlm.valid) { memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); sdata->vif.suspended_links = 0; changed = BSS_CHANGED_MLD_TTLM; } if (sdata->vif.active_links != active_links) { /* usable links are affected when active_links are changed, * so notify the driver about the status change */ changed |= BSS_CHANGED_MLD_VALID_LINKS; active_links &= sdata->vif.active_links; if (!active_links) active_links = BIT(__ffs(sdata->vif.valid_links & ~dormant_links)); ret = ieee80211_set_active_links(&sdata->vif, active_links); if (ret) { sdata_info(sdata, "Failed to set TTLM active links\n"); goto out; } } ret = ieee80211_vif_set_links(sdata, sdata->vif.valid_links, dormant_links); if (ret) { sdata_info(sdata, "Failed to set TTLM dormant links\n"); goto out; } sdata->vif.suspended_links = suspended_links; if (sdata->vif.suspended_links) changed |= BSS_CHANGED_MLD_TTLM; ieee80211_vif_cfg_change_notify(sdata, changed); out: if (ret) ieee80211_disconnect(&sdata->vif, false); return ret; } static void ieee80211_tid_to_link_map_work(struct wiphy *wiphy, struct wiphy_work *work) { u16 new_active_links, new_dormant_links; struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.ttlm_work.work); new_active_links = sdata->u.mgd.ttlm_info.map & sdata->vif.valid_links; new_dormant_links = ~sdata->u.mgd.ttlm_info.map & sdata->vif.valid_links; ieee80211_vif_set_links(sdata, sdata->vif.valid_links, 0); if (ieee80211_ttlm_set_links(sdata, new_active_links, new_dormant_links, 0)) return; sdata->u.mgd.ttlm_info.active = true; sdata->u.mgd.ttlm_info.switch_time = 0; } static u16 ieee80211_get_ttlm(u8 bm_size, u8 *data) { if (bm_size == 1) return *data; else return get_unaligned_le16(data); } static int ieee80211_parse_adv_t2l(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ttlm_elem *ttlm, struct ieee80211_adv_ttlm_info *ttlm_info) { /* The element size was already validated in * ieee80211_tid_to_link_map_size_ok() */ u8 control, link_map_presence, map_size, tid; u8 *pos; memset(ttlm_info, 0, sizeof(*ttlm_info)); pos = (void *)ttlm->optional; control = ttlm->control; if ((control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP) || !(control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT)) return 0; if ((control & IEEE80211_TTLM_CONTROL_DIRECTION) != IEEE80211_TTLM_DIRECTION_BOTH) { sdata_info(sdata, "Invalid advertised T2L map direction\n"); return -EINVAL; } link_map_presence = *pos; pos++; ttlm_info->switch_time = get_unaligned_le16(pos); /* Since ttlm_info->switch_time == 0 means no switch time, bump it * by 1. */ if (!ttlm_info->switch_time) ttlm_info->switch_time = 1; pos += 2; if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT) { ttlm_info->duration = pos[0] | pos[1] << 8 | pos[2] << 16; pos += 3; } if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) map_size = 1; else map_size = 2; /* According to Draft P802.11be_D3.0 clause 35.3.7.1.7, an AP MLD shall * not advertise a TID-to-link mapping that does not map all TIDs to the * same link set, reject frame if not all links have mapping */ if (link_map_presence != 0xff) { sdata_info(sdata, "Invalid advertised T2L mapping presence indicator\n"); return -EINVAL; } ttlm_info->map = ieee80211_get_ttlm(map_size, pos); if (!ttlm_info->map) { sdata_info(sdata, "Invalid advertised T2L map for TID 0\n"); return -EINVAL; } pos += map_size; for (tid = 1; tid < 8; tid++) { u16 map = ieee80211_get_ttlm(map_size, pos); if (map != ttlm_info->map) { sdata_info(sdata, "Invalid advertised T2L map for tid %d\n", tid); return -EINVAL; } pos += map_size; } return 0; } static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, u64 beacon_ts) { u8 i; int ret; if (!ieee80211_vif_is_mld(&sdata->vif)) return; if (!elems->ttlm_num) { if (sdata->u.mgd.ttlm_info.switch_time) { /* if a planned TID-to-link mapping was cancelled - * abort it */ wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); } else if (sdata->u.mgd.ttlm_info.active) { /* if no TID-to-link element, set to default mapping in * which all TIDs are mapped to all setup links */ ret = ieee80211_vif_set_links(sdata, sdata->vif.valid_links, 0); if (ret) { sdata_info(sdata, "Failed setting valid/dormant links\n"); return; } ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); } memset(&sdata->u.mgd.ttlm_info, 0, sizeof(sdata->u.mgd.ttlm_info)); return; } for (i = 0; i < elems->ttlm_num; i++) { struct ieee80211_adv_ttlm_info ttlm_info; u32 res; res = ieee80211_parse_adv_t2l(sdata, elems->ttlm[i], &ttlm_info); if (res) { __ieee80211_disconnect(sdata); return; } if (ttlm_info.switch_time) { u16 beacon_ts_tu, st_tu, delay; u64 delay_usec; u64 mask; /* The t2l map switch time is indicated with a partial * TSF value (bits 10 to 25), get the partial beacon TS * as well, and calc the delay to the start time. */ mask = GENMASK_ULL(25, 10); beacon_ts_tu = (beacon_ts & mask) >> 10; st_tu = ttlm_info.switch_time; delay = st_tu - beacon_ts_tu; /* * If the switch time is far in the future, then it * could also be the previous switch still being * announced. * We can simply ignore it for now, if it is a future * switch the AP will continue to announce it anyway. */ if (delay > IEEE80211_ADV_TTLM_ST_UNDERFLOW) return; delay_usec = ieee80211_tu_to_usec(delay); /* Link switching can take time, so schedule it * 100ms before to be ready on time */ if (delay_usec > IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS) delay_usec -= IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS; else delay_usec = 0; sdata->u.mgd.ttlm_info = ttlm_info; wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work, us_to_ktime(delay_usec)); return; } } } static void ieee80211_mgd_check_cross_link_csa(struct ieee80211_sub_if_data *sdata, int reporting_link_id, struct ieee802_11_elems *elems) { const struct element *sta_profiles[IEEE80211_MLD_MAX_NUM_LINKS] = {}; ssize_t sta_profiles_len[IEEE80211_MLD_MAX_NUM_LINKS] = {}; const struct element *sub; const u8 *subelems; size_t subelems_len; u8 common_size; int link_id; if (!ieee80211_mle_size_ok((u8 *)elems->ml_basic, elems->ml_basic_len)) return; common_size = ieee80211_mle_common_size((u8 *)elems->ml_basic); subelems = (u8 *)elems->ml_basic + common_size; subelems_len = elems->ml_basic_len - common_size; for_each_element_id(sub, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE, subelems, subelems_len) { struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; struct ieee80211_link_data *link; ssize_t len; if (!ieee80211_mle_basic_sta_prof_size_ok(sub->data, sub->datalen)) continue; link_id = le16_get_bits(prof->control, IEEE80211_MLE_STA_CONTROL_LINK_ID); /* need a valid link ID, but also not our own, both AP bugs */ if (link_id == reporting_link_id || link_id >= IEEE80211_MLD_MAX_NUM_LINKS) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; len = cfg80211_defragment_element(sub, subelems, subelems_len, NULL, 0, IEEE80211_MLE_SUBELEM_FRAGMENT); if (WARN_ON(len < 0)) continue; sta_profiles[link_id] = sub; sta_profiles_len[link_id] = len; } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_mle_per_sta_profile *prof; struct ieee802_11_elems *prof_elems; struct ieee80211_link_data *link; ssize_t len; if (link_id == reporting_link_id) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; if (!sta_profiles[link_id]) { prof_elems = NULL; goto handle; } /* we can defragment in-place, won't use the buffer again */ len = cfg80211_defragment_element(sta_profiles[link_id], subelems, subelems_len, (void *)sta_profiles[link_id], sta_profiles_len[link_id], IEEE80211_MLE_SUBELEM_FRAGMENT); if (WARN_ON(len != sta_profiles_len[link_id])) continue; prof = (void *)sta_profiles[link_id]; prof_elems = ieee802_11_parse_elems(prof->variable + (prof->sta_info_len - 1), len - (prof->sta_info_len - 1), false, NULL); /* memory allocation failed - let's hope that's transient */ if (!prof_elems) continue; handle: /* * FIXME: the timings here are obviously incorrect, * but only older Intel drivers seem to care, and * those don't have MLO. If you really need this, * the problem is having to calculate it with the * TSF offset etc. The device_timestamp is still * correct, of course. */ ieee80211_sta_process_chanswitch(link, 0, 0, elems, prof_elems, IEEE80211_CSA_SOURCE_OTHER_LINK); kfree(prof_elems); } } static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata, const struct ieee802_11_elems *elems) { struct ieee80211_vif_cfg *cfg = &sdata->vif.cfg; static u8 zero_ssid[IEEE80211_MAX_SSID_LEN]; if (!elems->ssid) return false; /* hidden SSID: zero length */ if (elems->ssid_len == 0) return false; if (elems->ssid_len != cfg->ssid_len) return true; /* hidden SSID: zeroed out */ if (!memcmp(elems->ssid, zero_ssid, elems->ssid_len)) return false; return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len); } static bool ieee80211_rx_beacon_freq_valid(struct ieee80211_local *local, struct ieee80211_mgmt *mgmt, struct ieee80211_rx_status *rx_status, struct ieee80211_chanctx_conf *chanctx) { u32 pri_2mhz_khz; struct ieee80211_channel *s1g_sibling_1mhz; u32 pri_khz = ieee80211_channel_to_khz(chanctx->def.chan); u32 rx_khz = ieee80211_rx_status_to_khz(rx_status); if (rx_khz == pri_khz) return true; if (!chanctx->def.s1g_primary_2mhz) return false; /* * If we have an S1G interface with a 2MHz primary, beacons are * sent on the center frequency of the 2MHz primary. Find the sibling * 1MHz channel and calculate the 2MHz primary center frequency. */ s1g_sibling_1mhz = cfg80211_s1g_get_primary_sibling(local->hw.wiphy, &chanctx->def); if (!s1g_sibling_1mhz) return false; pri_2mhz_khz = (pri_khz + ieee80211_channel_to_khz(s1g_sibling_1mhz)) / 2; return rx_khz == pri_2mhz_khz; } static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_hdr *hdr, size_t len, struct ieee80211_rx_status *rx_status) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; struct ieee80211_mgmt *mgmt = (void *) hdr; struct ieee80211_ext *ext = NULL; size_t baselen; struct ieee802_11_elems *elems; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; struct link_sta_info *link_sta; struct sta_info *sta; u64 changed = 0; bool erp_valid; u8 erp_value = 0; u32 ncrc = 0; u8 *bssid, *variable = mgmt->u.beacon.variable; u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN]; struct ieee80211_elems_parse_params parse_params = { .mode = link->u.mgd.conn.mode, .link_id = -1, .from_ap = true, }; lockdep_assert_wiphy(local->hw.wiphy); /* Process beacon from the current BSS */ bssid = ieee80211_get_bssid(hdr, len, sdata->vif.type); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *)mgmt; variable = ext->u.s1g_beacon.variable + ieee80211_s1g_optional_len(ext->frame_control); } baselen = (u8 *) variable - (u8 *) mgmt; if (baselen > len) return; parse_params.start = variable; parse_params.len = len - baselen; rcu_read_lock(); chanctx_conf = rcu_dereference(bss_conf->chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); return; } if (!ieee80211_rx_beacon_freq_valid(local, mgmt, rx_status, chanctx_conf)) { rcu_read_unlock(); return; } chan = chanctx_conf->def.chan; rcu_read_unlock(); if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon && !WARN_ON(ieee80211_vif_is_mld(&sdata->vif)) && ieee80211_rx_our_beacon(bssid, ifmgd->assoc_data->link[0].bss)) { parse_params.bss = ifmgd->assoc_data->link[0].bss; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return; ieee80211_rx_bss_info(link, mgmt, len, rx_status); if (elems->dtim_period) link->u.mgd.dtim_period = elems->dtim_period; link->u.mgd.have_beacon = true; ifmgd->assoc_data->need_beacon = false; if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY) && !ieee80211_is_s1g_beacon(hdr->frame_control)) { bss_conf->sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); bss_conf->sync_device_ts = rx_status->device_timestamp; bss_conf->sync_dtim_count = elems->dtim_count; } if (elems->mbssid_config_ie) bss_conf->profile_periodicity = elems->mbssid_config_ie->profile_periodicity; else bss_conf->profile_periodicity = 0; if (elems->ext_capab_len >= 11 && (elems->ext_capab[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) bss_conf->ema_ap = true; else bss_conf->ema_ap = false; /* continue assoc process */ ifmgd->assoc_data->timeout = jiffies; ifmgd->assoc_data->timeout_started = true; run_again(sdata, ifmgd->assoc_data->timeout); kfree(elems); return; } if (!ifmgd->associated || !ieee80211_rx_our_beacon(bssid, bss_conf->bss)) return; bssid = link->u.mgd.bssid; if (!(rx_status->flag & RX_FLAG_NO_SIGNAL_VAL)) ieee80211_handle_beacon_sig(link, ifmgd, bss_conf, local, rx_status); if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) { mlme_dbg_ratelimited(sdata, "cancelling AP probe due to a received beacon\n"); ieee80211_reset_ap_probe(sdata); } /* * Push the beacon loss detection into the future since * we are processing a beacon from the AP just now. */ ieee80211_sta_reset_beacon_monitor(sdata); /* TODO: CRC urrently not calculated on S1G Beacon Compatibility * element (which carries the beacon interval). Don't forget to add a * bit to care_about_ies[] above if mac80211 is interested in a * changing S1G element. */ if (!ieee80211_is_s1g_beacon(hdr->frame_control)) ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4); parse_params.bss = bss_conf->bss; parse_params.filter = care_about_ies; parse_params.crc = ncrc; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return; if (rx_status->flag & RX_FLAG_DECRYPTED && ieee80211_mgd_ssid_mismatch(sdata, elems)) { sdata_info(sdata, "SSID mismatch for AP %pM, disconnect\n", sdata->vif.cfg.ap_addr); __ieee80211_disconnect(sdata); return; } ncrc = elems->crc; if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid, vif_cfg->s1g)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } ieee80211_send_nullfunc(local, sdata, false); } else if (!local->pspolling && sdata->u.mgd.powersave) { local->pspolling = true; /* * Here is assumed that the driver will be * able to send ps-poll frame and receive a * response even though power save mode is * enabled, but some drivers might require * to disable power save here. This needs * to be investigated. */ ieee80211_send_pspoll(local, sdata); } } if (sdata->vif.p2p || sdata->vif.driver_flags & IEEE80211_VIF_GET_NOA_UPDATE) { struct ieee80211_p2p_noa_attr noa = {}; int ret; ret = cfg80211_get_p2p_attr(variable, len - baselen, IEEE80211_P2P_ATTR_ABSENCE_NOTICE, (u8 *) &noa, sizeof(noa)); if (ret >= 2) { if (link->u.mgd.p2p_noa_index != noa.index) { /* valid noa_attr and index changed */ link->u.mgd.p2p_noa_index = noa.index; memcpy(&bss_conf->p2p_noa_attr, &noa, sizeof(noa)); changed |= BSS_CHANGED_P2P_PS; /* * make sure we update all information, the CRC * mechanism doesn't look at P2P attributes. */ link->u.mgd.beacon_crc_valid = false; } } else if (link->u.mgd.p2p_noa_index != -1) { /* noa_attr not found and we had valid noa_attr before */ link->u.mgd.p2p_noa_index = -1; memset(&bss_conf->p2p_noa_attr, 0, sizeof(bss_conf->p2p_noa_attr)); changed |= BSS_CHANGED_P2P_PS; link->u.mgd.beacon_crc_valid = false; } } /* * Update beacon timing and dtim count on every beacon appearance. This * will allow the driver to use the most updated values. Do it before * comparing this one with last received beacon. * IMPORTANT: These parameters would possibly be out of sync by the time * the driver will use them. The synchronized view is currently * guaranteed only in certain callbacks. */ if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY) && !ieee80211_is_s1g_beacon(hdr->frame_control)) { bss_conf->sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); bss_conf->sync_device_ts = rx_status->device_timestamp; bss_conf->sync_dtim_count = elems->dtim_count; } if ((ncrc == link->u.mgd.beacon_crc && link->u.mgd.beacon_crc_valid) || (ext && ieee80211_is_s1g_short_beacon(ext->frame_control, parse_params.start, parse_params.len))) goto free; link->u.mgd.beacon_crc = ncrc; link->u.mgd.beacon_crc_valid = true; ieee80211_rx_bss_info(link, mgmt, len, rx_status); ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, elems, elems, IEEE80211_CSA_SOURCE_BEACON); /* note that after this elems->ml_basic can no longer be used fully */ ieee80211_mgd_check_cross_link_csa(sdata, rx_status->link_id, elems); ieee80211_mgd_update_bss_param_ch_cnt(sdata, bss_conf, elems); if (!sdata->u.mgd.epcs.enabled && !link->u.mgd.disable_wmm_tracking && ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set)) changed |= BSS_CHANGED_QOS; /* * If we haven't had a beacon before, tell the driver about the * DTIM period (and beacon timing if desired) now. */ if (!link->u.mgd.have_beacon) { /* a few bogus AP send dtim_period = 0 or no TIM IE */ bss_conf->dtim_period = elems->dtim_period ?: 1; changed |= BSS_CHANGED_BEACON_INFO; link->u.mgd.have_beacon = true; ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); } if (elems->erp_info) { erp_valid = true; erp_value = elems->erp_info[0]; } else { erp_valid = false; } if (!ieee80211_is_s1g_beacon(hdr->frame_control)) changed |= ieee80211_handle_bss_capability(link, le16_to_cpu(mgmt->u.beacon.capab_info), erp_valid, erp_value); sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (WARN_ON(!sta)) { goto free; } link_sta = rcu_dereference_protected(sta->link[link->link_id], lockdep_is_held(&local->hw.wiphy->mtx)); if (WARN_ON(!link_sta)) { goto free; } if (WARN_ON(!bss_conf->chanreq.oper.chan)) goto free; sband = local->hw.wiphy->bands[bss_conf->chanreq.oper.chan->band]; changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); if (ieee80211_config_bw(link, elems, true, &changed, IEEE80211_STYPE_BEACON)) { ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, deauth_buf); ieee80211_report_disconnect(sdata, deauth_buf, sizeof(deauth_buf), true, WLAN_REASON_DEAUTH_LEAVING, false); goto free; } if (elems->opmode_notif) ieee80211_vht_handle_opmode(sdata, link_sta, *elems->opmode_notif, rx_status->band); changed |= ieee80211_handle_pwr_constr(link, chan, mgmt, elems->country_elem, elems->country_elem_len, elems->pwr_constr_elem, elems->cisco_dtpc_elem); ieee80211_ml_reconfiguration(sdata, elems); ieee80211_process_adv_ttlm(sdata, elems, le64_to_cpu(mgmt->u.beacon.timestamp)); ieee80211_link_info_change_notify(sdata, link, changed); free: kfree(elems); } static void ieee80211_apply_neg_ttlm(struct ieee80211_sub_if_data *sdata, struct ieee80211_neg_ttlm neg_ttlm) { u16 new_active_links, new_dormant_links, new_suspended_links, map = 0; u8 i; for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) map |= neg_ttlm.downlink[i] | neg_ttlm.uplink[i]; /* If there is an active TTLM, unset previously suspended links */ if (sdata->vif.neg_ttlm.valid) sdata->vif.dormant_links &= ~sdata->vif.suspended_links; /* exclude links that are already disabled by advertised TTLM */ new_active_links = map & sdata->vif.valid_links & ~sdata->vif.dormant_links; new_suspended_links = (~map & sdata->vif.valid_links) & ~sdata->vif.dormant_links; new_dormant_links = sdata->vif.dormant_links | new_suspended_links; if (ieee80211_ttlm_set_links(sdata, new_active_links, new_dormant_links, new_suspended_links)) return; sdata->vif.neg_ttlm = neg_ttlm; sdata->vif.neg_ttlm.valid = true; } static void ieee80211_neg_ttlm_timeout_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.neg_ttlm_timeout_work.work); sdata_info(sdata, "No negotiated TTLM response from AP, disconnecting.\n"); __ieee80211_disconnect(sdata); } static void ieee80211_neg_ttlm_add_suggested_map(struct sk_buff *skb, struct ieee80211_neg_ttlm *neg_ttlm) { u8 i, direction[IEEE80211_TTLM_MAX_CNT]; if (memcmp(neg_ttlm->downlink, neg_ttlm->uplink, sizeof(neg_ttlm->downlink))) { direction[0] = IEEE80211_TTLM_DIRECTION_DOWN; direction[1] = IEEE80211_TTLM_DIRECTION_UP; } else { direction[0] = IEEE80211_TTLM_DIRECTION_BOTH; } for (i = 0; i < ARRAY_SIZE(direction); i++) { u8 tid, len, map_ind = 0, *len_pos, *map_ind_pos, *pos; __le16 map; len = sizeof(struct ieee80211_ttlm_elem) + 1 + 1; pos = skb_put(skb, len + 2); *pos++ = WLAN_EID_EXTENSION; len_pos = pos++; *pos++ = WLAN_EID_EXT_TID_TO_LINK_MAPPING; *pos++ = direction[i]; map_ind_pos = pos++; for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { map = direction[i] == IEEE80211_TTLM_DIRECTION_UP ? cpu_to_le16(neg_ttlm->uplink[tid]) : cpu_to_le16(neg_ttlm->downlink[tid]); if (!map) continue; len += 2; map_ind |= BIT(tid); skb_put_data(skb, &map, sizeof(map)); } *map_ind_pos = map_ind; *len_pos = len; if (direction[i] == IEEE80211_TTLM_DIRECTION_BOTH) break; } } static void ieee80211_send_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_neg_ttlm *neg_ttlm, u8 dialog_token) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_req); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len); if (!skb) return; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ttlm_req.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_REQ; mgmt->u.action.u.ttlm_req.dialog_token = dialog_token; ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); ieee80211_tx_skb(sdata, skb); } int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata, struct cfg80211_ttlm_params *params) { struct ieee80211_neg_ttlm neg_ttlm = {}; u8 i; if (!ieee80211_vif_is_mld(&sdata->vif) || !(sdata->vif.cfg.mld_capa_op & IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP)) return -EINVAL; for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) { if ((params->dlink[i] & ~sdata->vif.valid_links) || (params->ulink[i] & ~sdata->vif.valid_links)) return -EINVAL; neg_ttlm.downlink[i] = params->dlink[i]; neg_ttlm.uplink[i] = params->ulink[i]; } if (drv_can_neg_ttlm(sdata->local, sdata, &neg_ttlm) != NEG_TTLM_RES_ACCEPT) return -EINVAL; ieee80211_apply_neg_ttlm(sdata, neg_ttlm); sdata->u.mgd.dialog_token_alloc++; ieee80211_send_neg_ttlm_req(sdata, &sdata->vif.neg_ttlm, sdata->u.mgd.dialog_token_alloc); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.neg_ttlm_timeout_work); wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.neg_ttlm_timeout_work, IEEE80211_NEG_TTLM_REQ_TIMEOUT); return 0; } static void ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, enum ieee80211_neg_ttlm_res ttlm_res, u8 dialog_token, struct ieee80211_neg_ttlm *neg_ttlm) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_res); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; u16 status_code; skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len); if (!skb) return; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ttlm_res.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_RES; mgmt->u.action.u.ttlm_res.dialog_token = dialog_token; switch (ttlm_res) { default: WARN_ON(1); fallthrough; case NEG_TTLM_RES_REJECT: status_code = WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING; break; case NEG_TTLM_RES_ACCEPT: status_code = WLAN_STATUS_SUCCESS; break; case NEG_TTLM_RES_SUGGEST_PREFERRED: status_code = WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED; ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); break; } mgmt->u.action.u.ttlm_res.status_code = cpu_to_le16(status_code); ieee80211_tx_skb(sdata, skb); } static int ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ttlm_elem *ttlm, struct ieee80211_neg_ttlm *neg_ttlm, u8 *direction) { u8 control, link_map_presence, map_size, tid; u8 *pos; /* The element size was already validated in * ieee80211_tid_to_link_map_size_ok() */ pos = (void *)ttlm->optional; control = ttlm->control; /* mapping switch time and expected duration fields are not expected * in case of negotiated TTLM */ if (control & (IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT | IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT)) { mlme_dbg(sdata, "Invalid TTLM element in negotiated TTLM request\n"); return -EINVAL; } if (control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP) { for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { neg_ttlm->downlink[tid] = sdata->vif.valid_links; neg_ttlm->uplink[tid] = sdata->vif.valid_links; } *direction = IEEE80211_TTLM_DIRECTION_BOTH; return 0; } *direction = u8_get_bits(control, IEEE80211_TTLM_CONTROL_DIRECTION); if (*direction != IEEE80211_TTLM_DIRECTION_DOWN && *direction != IEEE80211_TTLM_DIRECTION_UP && *direction != IEEE80211_TTLM_DIRECTION_BOTH) return -EINVAL; link_map_presence = *pos; pos++; if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) map_size = 1; else map_size = 2; for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { u16 map; if (link_map_presence & BIT(tid)) { map = ieee80211_get_ttlm(map_size, pos); if (!map) { mlme_dbg(sdata, "No active links for TID %d", tid); return -EINVAL; } } else { map = 0; } switch (*direction) { case IEEE80211_TTLM_DIRECTION_BOTH: neg_ttlm->downlink[tid] = map; neg_ttlm->uplink[tid] = map; break; case IEEE80211_TTLM_DIRECTION_DOWN: neg_ttlm->downlink[tid] = map; break; case IEEE80211_TTLM_DIRECTION_UP: neg_ttlm->uplink[tid] = map; break; default: return -EINVAL; } pos += map_size; } return 0; } void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { u8 dialog_token, direction[IEEE80211_TTLM_MAX_CNT] = {}, i; size_t ies_len; enum ieee80211_neg_ttlm_res ttlm_res = NEG_TTLM_RES_ACCEPT; struct ieee802_11_elems *elems = NULL; struct ieee80211_neg_ttlm neg_ttlm = {}; BUILD_BUG_ON(ARRAY_SIZE(direction) != ARRAY_SIZE(elems->ttlm)); if (!ieee80211_vif_is_mld(&sdata->vif)) return; dialog_token = mgmt->u.action.u.ttlm_req.dialog_token; ies_len = len - offsetof(struct ieee80211_mgmt, u.action.u.ttlm_req.variable); elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable, ies_len, true, NULL); if (!elems) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } for (i = 0; i < elems->ttlm_num; i++) { if (ieee80211_parse_neg_ttlm(sdata, elems->ttlm[i], &neg_ttlm, &direction[i]) || (direction[i] == IEEE80211_TTLM_DIRECTION_BOTH && elems->ttlm_num != 1)) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } } if (!elems->ttlm_num || (elems->ttlm_num == 2 && direction[0] == direction[1])) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) { if ((neg_ttlm.downlink[i] && (neg_ttlm.downlink[i] & ~sdata->vif.valid_links)) || (neg_ttlm.uplink[i] && (neg_ttlm.uplink[i] & ~sdata->vif.valid_links))) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } } ttlm_res = drv_can_neg_ttlm(sdata->local, sdata, &neg_ttlm); if (ttlm_res != NEG_TTLM_RES_ACCEPT) goto out; ieee80211_apply_neg_ttlm(sdata, neg_ttlm); out: kfree(elems); ieee80211_send_neg_ttlm_res(sdata, ttlm_res, dialog_token, &neg_ttlm); } void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { if (!ieee80211_vif_is_mld(&sdata->vif) || mgmt->u.action.u.ttlm_req.dialog_token != sdata->u.mgd.dialog_token_alloc) return; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.neg_ttlm_timeout_work); /* MLD station sends a TID to link mapping request, mainly to handle * BTM (BSS transition management) request, in which case it needs to * restrict the active links set. * In this case it's not expected that the MLD AP will reject the * negotiated TTLM request. * This can be better implemented in the future, to handle request * rejections. */ if (le16_to_cpu(mgmt->u.action.u.ttlm_res.status_code) != WLAN_STATUS_SUCCESS) __ieee80211_disconnect(sdata); } void ieee80211_process_ttlm_teardown(struct ieee80211_sub_if_data *sdata) { u16 new_dormant_links; if (!sdata->vif.neg_ttlm.valid) return; memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); new_dormant_links = sdata->vif.dormant_links & ~sdata->vif.suspended_links; sdata->vif.suspended_links = 0; ieee80211_vif_set_links(sdata, sdata->vif.valid_links, new_dormant_links); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_TTLM | BSS_CHANGED_MLD_VALID_LINKS); } static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.teardown_ttlm_work); ieee80211_process_ttlm_teardown(sdata); } void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int frame_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_tear_down); struct ieee80211_tx_info *info; skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = skb_put_zero(skb, frame_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ttlm_tear_down.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; info->status_data = IEEE80211_STATUS_TYPE_NEG_TTLM; ieee80211_tx_skb(sdata, skb); } EXPORT_SYMBOL(ieee80211_send_teardown_neg_ttlm); void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_link_data *link = &sdata->deflink; struct ieee80211_rx_status *rx_status; struct ieee80211_hdr *hdr; u16 fc; lockdep_assert_wiphy(sdata->local->hw.wiphy); rx_status = (struct ieee80211_rx_status *) skb->cb; hdr = (struct ieee80211_hdr *) skb->data; fc = le16_to_cpu(hdr->frame_control); switch (fc & IEEE80211_FCTL_STYPE) { case IEEE80211_STYPE_S1G_BEACON: ieee80211_rx_mgmt_beacon(link, hdr, skb->len, rx_status); break; } } void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_link_data *link = &sdata->deflink; struct ieee80211_rx_status *rx_status; struct ieee802_11_elems *elems; struct ieee80211_mgmt *mgmt; u16 fc; int ies_len; lockdep_assert_wiphy(sdata->local->hw.wiphy); rx_status = (struct ieee80211_rx_status *) skb->cb; mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); if (rx_status->link_valid) { link = sdata_dereference(sdata->link[rx_status->link_id], sdata); if (!link) return; } switch (fc & IEEE80211_FCTL_STYPE) { case IEEE80211_STYPE_BEACON: ieee80211_rx_mgmt_beacon(link, (void *)mgmt, skb->len, rx_status); break; case IEEE80211_STYPE_PROBE_RESP: ieee80211_rx_mgmt_probe_resp(link, skb); break; case IEEE80211_STYPE_AUTH: ieee80211_rx_mgmt_auth(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_DEAUTH: ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_DISASSOC: ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_ASSOC_RESP: case IEEE80211_STYPE_REASSOC_RESP: ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_ACTION: if (!sdata->u.mgd.associated || !ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) break; switch (mgmt->u.action.category) { case WLAN_CATEGORY_SPECTRUM_MGMT: ies_len = skb->len - offsetof(struct ieee80211_mgmt, u.action.u.chan_switch.variable); if (ies_len < 0) break; /* CSA IE cannot be overridden, no need for BSSID */ elems = ieee802_11_parse_elems( mgmt->u.action.u.chan_switch.variable, ies_len, true, NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src = IEEE80211_CSA_SOURCE_PROT_ACTION; ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, elems, elems, src); } kfree(elems); break; case WLAN_CATEGORY_PUBLIC: case WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION: ies_len = skb->len - offsetof(struct ieee80211_mgmt, u.action.u.ext_chan_switch.variable); if (ies_len < 0) break; /* * extended CSA IE can't be overridden, no need for * BSSID */ elems = ieee802_11_parse_elems( mgmt->u.action.u.ext_chan_switch.variable, ies_len, true, NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src; if (mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION) src = IEEE80211_CSA_SOURCE_PROT_ACTION; else src = IEEE80211_CSA_SOURCE_UNPROT_ACTION; /* for the handling code pretend it was an IE */ elems->ext_chansw_ie = &mgmt->u.action.u.ext_chan_switch.data; ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, elems, elems, src); } kfree(elems); break; } break; } } static void ieee80211_sta_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = timer_container_of(sdata, t, u.mgd.timer); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, u8 reason, bool tx) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, reason, tx, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, reason, false); } static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_auth_data *auth_data = ifmgd->auth_data; u32 tx_flags = 0; u16 trans = 1; u16 status = 0; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, }; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON_ONCE(!auth_data)) return -EINVAL; auth_data->tries++; if (auth_data->tries > IEEE80211_AUTH_MAX_TRIES) { sdata_info(sdata, "authentication with %pM timed out\n", auth_data->ap_addr); /* * Most likely AP is not in the range so remove the * bss struct for that AP. */ cfg80211_unlink_bss(local->hw.wiphy, auth_data->bss); return -ETIMEDOUT; } if (auth_data->algorithm == WLAN_AUTH_SAE) info.duration = jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE); info.link_id = auth_data->link_id; drv_mgd_prepare_tx(local, sdata, &info); sdata_info(sdata, "send auth to %pM (try %d/%d)\n", auth_data->ap_addr, auth_data->tries, IEEE80211_AUTH_MAX_TRIES); auth_data->expected_transaction = 2; if (auth_data->algorithm == WLAN_AUTH_SAE) { trans = auth_data->sae_trans; status = auth_data->sae_status; auth_data->expected_transaction = trans; } if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, trans, auth_data->algorithm, status, auth_data->data, auth_data->data_len, auth_data->ap_addr, auth_data->ap_addr, NULL, 0, 0, tx_flags); if (tx_flags == 0) { if (auth_data->algorithm == WLAN_AUTH_SAE) auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT_SAE; else auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT; } else { auth_data->timeout = round_jiffies_up(jiffies + IEEE80211_AUTH_TIMEOUT_LONG); } auth_data->timeout_started = true; run_again(sdata, auth_data->timeout); return 0; } static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; struct ieee80211_local *local = sdata->local; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); assoc_data->tries++; assoc_data->comeback = false; if (assoc_data->tries > IEEE80211_ASSOC_MAX_TRIES) { sdata_info(sdata, "association with %pM timed out\n", assoc_data->ap_addr); /* * Most likely AP is not in the range so remove the * bss struct for that AP. */ cfg80211_unlink_bss(local->hw.wiphy, assoc_data->link[assoc_data->assoc_link_id].bss); return -ETIMEDOUT; } sdata_info(sdata, "associate with %pM (try %d/%d)\n", assoc_data->ap_addr, assoc_data->tries, IEEE80211_ASSOC_MAX_TRIES); ret = ieee80211_send_assoc(sdata); if (ret) return ret; if (!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT; assoc_data->timeout_started = true; run_again(sdata, assoc_data->timeout); } else { assoc_data->timeout = round_jiffies_up(jiffies + IEEE80211_ASSOC_TIMEOUT_LONG); assoc_data->timeout_started = true; run_again(sdata, assoc_data->timeout); } return 0; } void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, __le16 fc, bool acked) { struct ieee80211_local *local = sdata->local; sdata->u.mgd.status_fc = fc; sdata->u.mgd.status_acked = acked; sdata->u.mgd.status_received = true; wiphy_work_queue(local->hw.wiphy, &sdata->work); } void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (ifmgd->status_received) { __le16 fc = ifmgd->status_fc; bool status_acked = ifmgd->status_acked; ifmgd->status_received = false; if (ifmgd->auth_data && ieee80211_is_auth(fc)) { if (status_acked) { if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE) ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT_SAE; else ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT_SHORT; run_again(sdata, ifmgd->auth_data->timeout); } else { ifmgd->auth_data->timeout = jiffies - 1; } ifmgd->auth_data->timeout_started = true; } else if (ifmgd->assoc_data && !ifmgd->assoc_data->comeback && (ieee80211_is_assoc_req(fc) || ieee80211_is_reassoc_req(fc))) { /* * Update association timeout based on the TX status * for the (Re)Association Request frame. Skip this if * we have already processed a (Re)Association Response * frame that indicated need for association comeback * at a specific time in the future. This could happen * if the TX status information is delayed enough for * the response to be received and processed first. */ if (status_acked) { ifmgd->assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT_SHORT; run_again(sdata, ifmgd->assoc_data->timeout); } else { ifmgd->assoc_data->timeout = jiffies - 1; } ifmgd->assoc_data->timeout_started = true; } } if (ifmgd->auth_data && ifmgd->auth_data->timeout_started && time_after(jiffies, ifmgd->auth_data->timeout)) { if (ifmgd->auth_data->done || ifmgd->auth_data->waiting) { /* * ok ... we waited for assoc or continuation but * userspace didn't do it, so kill the auth data */ ieee80211_destroy_auth_data(sdata, false); } else if (ieee80211_auth(sdata)) { u8 ap_addr[ETH_ALEN]; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = AUTH_EVENT, .u.mlme.status = MLME_TIMEOUT, }; memcpy(ap_addr, ifmgd->auth_data->ap_addr, ETH_ALEN); ieee80211_destroy_auth_data(sdata, false); cfg80211_auth_timeout(sdata->dev, ap_addr); drv_event_callback(sdata->local, sdata, &event); } } else if (ifmgd->auth_data && ifmgd->auth_data->timeout_started) run_again(sdata, ifmgd->auth_data->timeout); if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started && time_after(jiffies, ifmgd->assoc_data->timeout)) { if ((ifmgd->assoc_data->need_beacon && !sdata->deflink.u.mgd.have_beacon) || ieee80211_do_assoc(sdata)) { struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = ASSOC_EVENT, .u.mlme.status = MLME_TIMEOUT, }; ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT); drv_event_callback(sdata->local, sdata, &event); } } else if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started) run_again(sdata, ifmgd->assoc_data->timeout); if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL && ifmgd->associated) { u8 *bssid = sdata->deflink.u.mgd.bssid; int max_tries; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) max_tries = max_nullfunc_tries; else max_tries = max_probe_tries; /* ACK received for nullfunc probing frame */ if (!ifmgd->probe_send_count) ieee80211_reset_ap_probe(sdata); else if (ifmgd->nullfunc_failed) { if (ifmgd->probe_send_count < max_tries) { mlme_dbg(sdata, "No ack for nullfunc frame to AP %pM, try %d/%i\n", bssid, ifmgd->probe_send_count, max_tries); ieee80211_mgd_probe_ap_send(sdata); } else { mlme_dbg(sdata, "No ack for nullfunc frame to AP %pM, disconnecting.\n", bssid); ieee80211_sta_connection_lost(sdata, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false); } } else if (time_is_after_jiffies(ifmgd->probe_timeout)) run_again(sdata, ifmgd->probe_timeout); else if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { mlme_dbg(sdata, "Failed to send nullfunc to AP %pM after %dms, disconnecting\n", bssid, probe_wait_ms); ieee80211_sta_connection_lost(sdata, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false); } else if (ifmgd->probe_send_count < max_tries) { mlme_dbg(sdata, "No probe response from AP %pM after %dms, try %d/%i\n", bssid, probe_wait_ms, ifmgd->probe_send_count, max_tries); ieee80211_mgd_probe_ap_send(sdata); } else { /* * We actually lost the connection ... or did we? * Let's make sure! */ mlme_dbg(sdata, "No probe response from AP %pM after %dms, disconnecting.\n", bssid, probe_wait_ms); ieee80211_sta_connection_lost(sdata, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false); } } } static bool ieee80211_is_csa_in_progress(struct ieee80211_sub_if_data *sdata) { /* * In MLO, check the CSA flags 'active' and 'waiting_bcn' for all * the links. */ struct ieee80211_link_data *link; guard(rcu)(); for_each_link_data_rcu(sdata, link) { if (!(link->conf->csa_active && !link->u.mgd.csa.waiting_bcn)) return false; } return true; } static void ieee80211_sta_bcn_mon_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = timer_container_of(sdata, t, u.mgd.bcn_mon_timer); if (ieee80211_is_csa_in_progress(sdata)) return; if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) return; sdata->u.mgd.connection_loss = false; wiphy_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.beacon_connection_loss_work); } static unsigned long ieee80211_latest_active_link_conn_timeout(struct ieee80211_sub_if_data *sdata) { unsigned long latest_timeout = jiffies; unsigned int link_id; struct sta_info *sta; guard(rcu)(); sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (!sta) return 0; for (link_id = 0; link_id < ARRAY_SIZE(sta->link); link_id++) { struct link_sta_info *link_sta; unsigned long timeout; link_sta = rcu_dereference(sta->link[link_id]); if (!link_sta) continue; timeout = link_sta->status_stats.last_ack; if (time_before(timeout, link_sta->rx_stats.last_rx)) timeout = link_sta->rx_stats.last_rx; timeout += IEEE80211_CONNECTION_IDLE_TIME; /* * latest_timeout holds the timeout of the link * that will expire last among all links in an * non-AP MLD STA. This ensures that the connection * monitor timer is only reset if at least one link * is still active, and it is scheduled to fire at * the latest possible timeout. */ if (time_after(timeout, latest_timeout)) latest_timeout = timeout; } return latest_timeout; } static void ieee80211_sta_conn_mon_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = timer_container_of(sdata, t, u.mgd.conn_mon_timer); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; unsigned long latest_timeout; if (ieee80211_is_csa_in_progress(sdata)) return; latest_timeout = ieee80211_latest_active_link_conn_timeout(sdata); /* * If latest timeout is after now, then update timer to fire at * the later date, but do not actually probe at this time. */ if (time_is_after_jiffies(latest_timeout)) { mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(latest_timeout)); return; } wiphy_work_queue(local->hw.wiphy, &sdata->u.mgd.monitor_work); } static void ieee80211_sta_monitor_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.monitor_work); ieee80211_mgd_probe_ap(sdata, false); } static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_STATION) { __ieee80211_stop_poll(sdata); /* let's probe the connection once */ if (!ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.monitor_work); } } #ifdef CONFIG_PM void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (ifmgd->auth_data || ifmgd->assoc_data) { const u8 *ap_addr = ifmgd->auth_data ? ifmgd->auth_data->ap_addr : ifmgd->assoc_data->ap_addr; /* * If we are trying to authenticate / associate while suspending, * cfg80211 won't know and won't actually abort those attempts, * thus we need to do that ourselves. */ ieee80211_send_deauth_disassoc(sdata, ap_addr, ap_addr, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, false, frame_buf); if (ifmgd->assoc_data) ieee80211_destroy_assoc_data(sdata, ASSOC_ABANDON); if (ifmgd->auth_data) ieee80211_destroy_auth_data(sdata, false); cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, IEEE80211_DEAUTH_FRAME_LEN, false); } /* This is a bit of a hack - we should find a better and more generic * solution to this. Normally when suspending, cfg80211 will in fact * deauthenticate. However, it doesn't (and cannot) stop an ongoing * auth (not so important) or assoc (this is the problem) process. * * As a consequence, it can happen that we are in the process of both * associating and suspending, and receive an association response * after cfg80211 has checked if it needs to disconnect, but before * we actually set the flag to drop incoming frames. This will then * cause the workqueue flush to process the association response in * the suspend, resulting in a successful association just before it * tries to remove the interface from the driver, which now though * has a channel context assigned ... this results in issues. * * To work around this (for now) simply deauth here again if we're * now connected. */ if (ifmgd->associated && !sdata->local->wowlan) { u8 bssid[ETH_ALEN]; struct cfg80211_deauth_request req = { .reason_code = WLAN_REASON_DEAUTH_LEAVING, .bssid = bssid, }; memcpy(bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); ieee80211_mgd_deauth(sdata, &req); } } #endif void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!ifmgd->associated) return; if (sdata->flags & IEEE80211_SDATA_DISCONNECT_RESUME) { sdata->flags &= ~IEEE80211_SDATA_DISCONNECT_RESUME; mlme_dbg(sdata, "driver requested disconnect after resume\n"); ieee80211_sta_connection_lost(sdata, WLAN_REASON_UNSPECIFIED, true); return; } if (sdata->flags & IEEE80211_SDATA_DISCONNECT_HW_RESTART) { sdata->flags &= ~IEEE80211_SDATA_DISCONNECT_HW_RESTART; mlme_dbg(sdata, "driver requested disconnect after hardware restart\n"); ieee80211_sta_connection_lost(sdata, WLAN_REASON_UNSPECIFIED, true); return; } } static void ieee80211_request_smps_mgd_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, u.mgd.request_smps_work); __ieee80211_request_smps_mgd(link->sdata, link, link->u.mgd.driver_smps_mode); } static void ieee80211_ml_sta_reconf_timeout(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.reconf.wk.work); if (!sdata->u.mgd.reconf.added_links && !sdata->u.mgd.reconf.removed_links) return; sdata_info(sdata, "mlo: reconf: timeout: added=0x%x, removed=0x%x\n", sdata->u.mgd.reconf.added_links, sdata->u.mgd.reconf.removed_links); __ieee80211_disconnect(sdata); } /* interface setup */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; wiphy_work_init(&ifmgd->monitor_work, ieee80211_sta_monitor_work); wiphy_work_init(&ifmgd->beacon_connection_loss_work, ieee80211_beacon_connection_loss_work); wiphy_work_init(&ifmgd->csa_connection_drop_work, ieee80211_csa_connection_drop_work); wiphy_delayed_work_init(&ifmgd->tdls_peer_del_work, ieee80211_tdls_peer_del_work); wiphy_hrtimer_work_init(&ifmgd->ml_reconf_work, ieee80211_ml_reconf_work); wiphy_delayed_work_init(&ifmgd->reconf.wk, ieee80211_ml_sta_reconf_timeout); timer_setup(&ifmgd->timer, ieee80211_sta_timer, 0); timer_setup(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, 0); timer_setup(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, 0); wiphy_delayed_work_init(&ifmgd->tx_tspec_wk, ieee80211_sta_handle_tspec_ac_params_wk); wiphy_hrtimer_work_init(&ifmgd->ttlm_work, ieee80211_tid_to_link_map_work); wiphy_delayed_work_init(&ifmgd->neg_ttlm_timeout_work, ieee80211_neg_ttlm_timeout_work); wiphy_work_init(&ifmgd->teardown_ttlm_work, ieee80211_teardown_ttlm_work); ifmgd->flags = 0; ifmgd->powersave = sdata->wdev.ps; ifmgd->uapsd_queues = sdata->local->hw.uapsd_queues; ifmgd->uapsd_max_sp_len = sdata->local->hw.uapsd_max_sp_len; /* Setup TDLS data */ spin_lock_init(&ifmgd->teardown_lock); ifmgd->teardown_skb = NULL; ifmgd->orig_teardown_skb = NULL; ifmgd->mcast_seq_last = IEEE80211_SN_MODULO; } static void ieee80211_recalc_smps_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, u.mgd.recalc_smps); ieee80211_recalc_smps(link->sdata, link); } void ieee80211_mgd_setup_link(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; unsigned int link_id = link->link_id; link->u.mgd.p2p_noa_index = -1; link->conf->bssid = link->u.mgd.bssid; link->smps_mode = IEEE80211_SMPS_OFF; wiphy_work_init(&link->u.mgd.request_smps_work, ieee80211_request_smps_mgd_work); wiphy_work_init(&link->u.mgd.recalc_smps, ieee80211_recalc_smps_work); if (local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS) link->u.mgd.req_smps = IEEE80211_SMPS_AUTOMATIC; else link->u.mgd.req_smps = IEEE80211_SMPS_OFF; wiphy_hrtimer_work_init(&link->u.mgd.csa.switch_work, ieee80211_csa_switch_work); ieee80211_clear_tpe(&link->conf->tpe); if (sdata->u.mgd.assoc_data) ether_addr_copy(link->conf->addr, sdata->u.mgd.assoc_data->link[link_id].addr); else if (sdata->u.mgd.reconf.add_links_data) ether_addr_copy(link->conf->addr, sdata->u.mgd.reconf.add_links_data->link[link_id].addr); else if (!is_valid_ether_addr(link->conf->addr)) eth_random_addr(link->conf->addr); } /* scan finished notification */ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; /* Restart STA timers */ rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) ieee80211_restart_sta_timer(sdata); } rcu_read_unlock(); } static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, s8 link_id, const u8 *ap_mld_addr, bool assoc, struct ieee80211_conn_settings *conn, bool override, unsigned long *userspace_selectors) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)cbss->priv; struct sta_info *new_sta = NULL; struct ieee80211_link_data *link; bool have_sta = false; bool mlo; int err; u16 new_links; if (link_id >= 0) { mlo = true; if (WARN_ON(!ap_mld_addr)) return -EINVAL; new_links = BIT(link_id); } else { if (WARN_ON(ap_mld_addr)) return -EINVAL; ap_mld_addr = cbss->bssid; new_links = 0; link_id = 0; mlo = false; } if (assoc) { rcu_read_lock(); have_sta = sta_info_get(sdata, ap_mld_addr); rcu_read_unlock(); } if (mlo && !have_sta && WARN_ON(sdata->vif.valid_links || sdata->vif.active_links)) return -EINVAL; err = ieee80211_vif_set_links(sdata, new_links, 0); if (err) return err; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) { err = -ENOLINK; goto out_err; } if (WARN_ON(!ifmgd->auth_data && !ifmgd->assoc_data)) { err = -EINVAL; goto out_err; } /* If a reconfig is happening, bail out */ if (local->in_reconfig) { err = -EBUSY; goto out_err; } if (!have_sta) { if (mlo) new_sta = sta_info_alloc_with_link(sdata, ap_mld_addr, link_id, cbss->bssid, GFP_KERNEL); else new_sta = sta_info_alloc(sdata, ap_mld_addr, GFP_KERNEL); if (!new_sta) { err = -ENOMEM; goto out_err; } new_sta->sta.mlo = mlo; } /* * Set up the information for the new channel before setting the * new channel. We can't - completely race-free - change the basic * rates bitmap and the channel (sband) that it refers to, but if * we set it up before we at least avoid calling into the driver's * bss_info_changed() method with invalid information (since we do * call that from changing the channel - only for IDLE and perhaps * some others, but ...). * * So to avoid that, just set up all the new information before the * channel, but tell the driver to apply it only afterwards, since * it might need the new channel for that. */ if (new_sta) { const struct cfg80211_bss_ies *ies; struct link_sta_info *link_sta; rcu_read_lock(); link_sta = rcu_dereference(new_sta->link[link_id]); if (WARN_ON(!link_sta)) { rcu_read_unlock(); sta_info_free(local, new_sta); err = -EINVAL; goto out_err; } err = ieee80211_mgd_setup_link_sta(link, new_sta, link_sta, cbss); if (err) { rcu_read_unlock(); sta_info_free(local, new_sta); goto out_err; } memcpy(link->u.mgd.bssid, cbss->bssid, ETH_ALEN); /* set timing information */ link->conf->beacon_int = cbss->beacon_interval; ies = rcu_dereference(cbss->beacon_ies); if (ies) { link->conf->sync_tsf = ies->tsf; link->conf->sync_device_ts = bss->device_ts_beacon; ieee80211_get_dtim(ies, &link->conf->sync_dtim_count, NULL); } else if (!ieee80211_hw_check(&sdata->local->hw, TIMING_BEACON_ONLY)) { ies = rcu_dereference(cbss->proberesp_ies); /* must be non-NULL since beacon IEs were NULL */ link->conf->sync_tsf = ies->tsf; link->conf->sync_device_ts = bss->device_ts_presp; link->conf->sync_dtim_count = 0; } else { link->conf->sync_tsf = 0; link->conf->sync_device_ts = 0; link->conf->sync_dtim_count = 0; } rcu_read_unlock(); } if (new_sta || override) { /* * Only set this if we're also going to calculate the AP * settings etc., otherwise this was set before in a * previous call. Note override is set to %true in assoc * if the settings were changed. */ link->u.mgd.conn = *conn; err = ieee80211_prep_channel(sdata, link, link->link_id, cbss, mlo, &link->u.mgd.conn, userspace_selectors); if (err) { if (new_sta) sta_info_free(local, new_sta); goto out_err; } /* pass out for use in assoc */ *conn = link->u.mgd.conn; } if (new_sta) { /* * tell driver about BSSID, basic rates and timing * this was set up above, before setting the channel */ ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_BSSID | BSS_CHANGED_BASIC_RATES | BSS_CHANGED_BEACON_INT); if (assoc) sta_info_pre_move_state(new_sta, IEEE80211_STA_AUTH); err = sta_info_insert(new_sta); new_sta = NULL; if (err) { sdata_info(sdata, "failed to insert STA entry for the AP (error %d)\n", err); goto out_release_chan; } } else WARN_ON_ONCE(!ether_addr_equal(link->u.mgd.bssid, cbss->bssid)); /* Cancel scan to ensure that nothing interferes with connection */ if (local->scanning) ieee80211_scan_cancel(local); return 0; out_release_chan: ieee80211_link_release_channel(link); out_err: ieee80211_vif_set_links(sdata, 0, 0); return err; } static bool ieee80211_mgd_csa_present(struct ieee80211_sub_if_data *sdata, const struct cfg80211_bss_ies *ies, u8 cur_channel, bool ignore_ecsa) { const struct element *csa_elem, *ecsa_elem; struct ieee80211_channel_sw_ie *csa = NULL; struct ieee80211_ext_chansw_ie *ecsa = NULL; if (!ies) return false; csa_elem = cfg80211_find_elem(WLAN_EID_CHANNEL_SWITCH, ies->data, ies->len); if (csa_elem && csa_elem->datalen == sizeof(*csa)) csa = (void *)csa_elem->data; ecsa_elem = cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, ies->data, ies->len); if (ecsa_elem && ecsa_elem->datalen == sizeof(*ecsa)) ecsa = (void *)ecsa_elem->data; if (csa && csa->count == 0) csa = NULL; if (csa && !csa->mode && csa->new_ch_num == cur_channel) csa = NULL; if (ecsa && ecsa->count == 0) ecsa = NULL; if (ecsa && !ecsa->mode && ecsa->new_ch_num == cur_channel) ecsa = NULL; if (ignore_ecsa && ecsa) { sdata_info(sdata, "Ignoring ECSA in probe response - was considered stuck!\n"); return csa; } return csa || ecsa; } static bool ieee80211_mgd_csa_in_process(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *bss) { u8 cur_channel; bool ret; cur_channel = ieee80211_frequency_to_channel(bss->channel->center_freq); rcu_read_lock(); if (ieee80211_mgd_csa_present(sdata, rcu_dereference(bss->beacon_ies), cur_channel, false)) { ret = true; goto out; } if (ieee80211_mgd_csa_present(sdata, rcu_dereference(bss->proberesp_ies), cur_channel, bss->proberesp_ecsa_stuck)) { ret = true; goto out; } ret = false; out: rcu_read_unlock(); return ret; } static void ieee80211_parse_cfg_selectors(unsigned long *userspace_selectors, const u8 *supported_selectors, u8 supported_selectors_len) { if (supported_selectors) { for (int i = 0; i < supported_selectors_len; i++) { set_bit(supported_selectors[i], userspace_selectors); } } else { /* Assume SAE_H2E support for backward compatibility. */ set_bit(BSS_MEMBERSHIP_SELECTOR_SAE_H2E, userspace_selectors); } } /* config hooks */ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, struct cfg80211_auth_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_auth_data *auth_data; struct ieee80211_conn_settings conn; struct ieee80211_link_data *link; struct ieee80211_supported_band *sband; struct ieee80211_bss *bss; u16 auth_alg; int err; bool cont_auth, wmm_used; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* prepare auth data structure */ switch (req->auth_type) { case NL80211_AUTHTYPE_OPEN_SYSTEM: auth_alg = WLAN_AUTH_OPEN; break; case NL80211_AUTHTYPE_SHARED_KEY: if (fips_enabled) return -EOPNOTSUPP; auth_alg = WLAN_AUTH_SHARED_KEY; break; case NL80211_AUTHTYPE_FT: auth_alg = WLAN_AUTH_FT; break; case NL80211_AUTHTYPE_NETWORK_EAP: auth_alg = WLAN_AUTH_LEAP; break; case NL80211_AUTHTYPE_SAE: auth_alg = WLAN_AUTH_SAE; break; case NL80211_AUTHTYPE_FILS_SK: auth_alg = WLAN_AUTH_FILS_SK; break; case NL80211_AUTHTYPE_FILS_SK_PFS: auth_alg = WLAN_AUTH_FILS_SK_PFS; break; case NL80211_AUTHTYPE_FILS_PK: auth_alg = WLAN_AUTH_FILS_PK; break; default: return -EOPNOTSUPP; } if (ifmgd->assoc_data) return -EBUSY; if (ieee80211_mgd_csa_in_process(sdata, req->bss)) { sdata_info(sdata, "AP is in CSA process, reject auth\n"); return -EINVAL; } auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len + req->ie_len, GFP_KERNEL); if (!auth_data) return -ENOMEM; memcpy(auth_data->ap_addr, req->ap_mld_addr ?: req->bss->bssid, ETH_ALEN); auth_data->bss = req->bss; auth_data->link_id = req->link_id; if (req->auth_data_len >= 4) { if (req->auth_type == NL80211_AUTHTYPE_SAE) { __le16 *pos = (__le16 *) req->auth_data; auth_data->sae_trans = le16_to_cpu(pos[0]); auth_data->sae_status = le16_to_cpu(pos[1]); } memcpy(auth_data->data, req->auth_data + 4, req->auth_data_len - 4); auth_data->data_len += req->auth_data_len - 4; } /* Check if continuing authentication or trying to authenticate with the * same BSS that we were in the process of authenticating with and avoid * removal and re-addition of the STA entry in * ieee80211_prep_connection(). */ cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss && ifmgd->auth_data->link_id == req->link_id; if (req->ie && req->ie_len) { memcpy(&auth_data->data[auth_data->data_len], req->ie, req->ie_len); auth_data->data_len += req->ie_len; } if (req->key && req->key_len) { auth_data->key_len = req->key_len; auth_data->key_idx = req->key_idx; memcpy(auth_data->key, req->key, req->key_len); } ieee80211_parse_cfg_selectors(auth_data->userspace_selectors, req->supported_selectors, req->supported_selectors_len); auth_data->algorithm = auth_alg; /* try to authenticate/probe */ if (ifmgd->auth_data) { if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE) { auth_data->peer_confirmed = ifmgd->auth_data->peer_confirmed; } ieee80211_destroy_auth_data(sdata, cont_auth); } /* prep auth_data so we don't go into idle on disassoc */ ifmgd->auth_data = auth_data; /* If this is continuation of an ongoing SAE authentication exchange * (i.e., request to send SAE Confirm) and the peer has already * confirmed, mark authentication completed since we are about to send * out SAE Confirm. */ if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE && auth_data->peer_confirmed && auth_data->sae_trans == 2) ieee80211_mark_sta_auth(sdata); if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; sdata_info(sdata, "disconnect from AP %pM for new auth to %pM\n", sdata->vif.cfg.ap_addr, auth_data->ap_addr); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_UNSPECIFIED, false, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, WLAN_REASON_UNSPECIFIED, false); } /* needed for transmitting the auth frame(s) properly */ memcpy(sdata->vif.cfg.ap_addr, auth_data->ap_addr, ETH_ALEN); bss = (void *)req->bss->priv; wmm_used = bss->wmm_used && (local->hw.queues >= IEEE80211_NUM_ACS); sband = local->hw.wiphy->bands[req->bss->channel->band]; ieee80211_determine_our_sta_mode_auth(sdata, sband, req, wmm_used, &conn); err = ieee80211_prep_connection(sdata, req->bss, req->link_id, req->ap_mld_addr, cont_auth, &conn, false, auth_data->userspace_selectors); if (err) goto err_clear; if (req->link_id >= 0) link = sdata_dereference(sdata->link[req->link_id], sdata); else link = &sdata->deflink; if (WARN_ON(!link)) { err = -ENOLINK; goto err_clear; } sdata_info(sdata, "authenticate with %pM (local address=%pM)\n", auth_data->ap_addr, link->conf->addr); err = ieee80211_auth(sdata); if (err) { sta_info_destroy_addr(sdata, auth_data->ap_addr); goto err_clear; } /* hold our own reference */ cfg80211_ref_bss(local->hw.wiphy, auth_data->bss); return 0; err_clear: if (!ieee80211_vif_is_mld(&sdata->vif)) { eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); ieee80211_link_release_channel(&sdata->deflink); } ifmgd->auth_data = NULL; kfree(auth_data); return err; } static void ieee80211_setup_assoc_link(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data, struct cfg80211_assoc_request *req, struct ieee80211_conn_settings *conn, unsigned int link_id) { struct ieee80211_local *local = sdata->local; const struct cfg80211_bss_ies *bss_ies; struct ieee80211_supported_band *sband; struct ieee80211_link_data *link; struct cfg80211_bss *cbss; struct ieee80211_bss *bss; cbss = assoc_data->link[link_id].bss; if (WARN_ON(!cbss)) return; bss = (void *)cbss->priv; sband = local->hw.wiphy->bands[cbss->channel->band]; if (WARN_ON(!sband)) return; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) return; /* for MLO connections assume advertising all rates is OK */ if (!req->ap_mld_addr) { assoc_data->supp_rates = bss->supp_rates; assoc_data->supp_rates_len = bss->supp_rates_len; } /* copy and link elems for the STA profile */ if (req->links[link_id].elems_len) { memcpy(assoc_data->ie_pos, req->links[link_id].elems, req->links[link_id].elems_len); assoc_data->link[link_id].elems = assoc_data->ie_pos; assoc_data->link[link_id].elems_len = req->links[link_id].elems_len; assoc_data->ie_pos += req->links[link_id].elems_len; } link->u.mgd.beacon_crc_valid = false; link->u.mgd.dtim_period = 0; link->u.mgd.have_beacon = false; /* override HT configuration only if the AP and we support it */ if (conn->mode >= IEEE80211_CONN_MODE_HT) { struct ieee80211_sta_ht_cap sta_ht_cap; memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); } rcu_read_lock(); bss_ies = rcu_dereference(cbss->beacon_ies); if (bss_ies) { u8 dtim_count = 0; ieee80211_get_dtim(bss_ies, &dtim_count, &link->u.mgd.dtim_period); sdata->deflink.u.mgd.have_beacon = true; if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { link->conf->sync_tsf = bss_ies->tsf; link->conf->sync_device_ts = bss->device_ts_beacon; link->conf->sync_dtim_count = dtim_count; } } else { bss_ies = rcu_dereference(cbss->ies); } if (bss_ies) { const struct element *elem; elem = cfg80211_find_ext_elem(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, bss_ies->data, bss_ies->len); if (elem && elem->datalen >= 3) link->conf->profile_periodicity = elem->data[2]; else link->conf->profile_periodicity = 0; elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, bss_ies->data, bss_ies->len); if (elem && elem->datalen >= 11 && (elem->data[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) link->conf->ema_ap = true; else link->conf->ema_ap = false; } rcu_read_unlock(); if (bss->corrupt_data) { char *corrupt_type = "data"; if (bss->corrupt_data & IEEE80211_BSS_CORRUPT_BEACON) { if (bss->corrupt_data & IEEE80211_BSS_CORRUPT_PROBE_RESP) corrupt_type = "beacon and probe response"; else corrupt_type = "beacon"; } else if (bss->corrupt_data & IEEE80211_BSS_CORRUPT_PROBE_RESP) { corrupt_type = "probe response"; } sdata_info(sdata, "associating to AP %pM with corrupt %s\n", cbss->bssid, corrupt_type); } if (link->u.mgd.req_smps == IEEE80211_SMPS_AUTOMATIC) { if (sdata->u.mgd.powersave) link->smps_mode = IEEE80211_SMPS_DYNAMIC; else link->smps_mode = IEEE80211_SMPS_OFF; } else { link->smps_mode = link->u.mgd.req_smps; } } static int ieee80211_mgd_get_ap_ht_vht_capa(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data, int link_id) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; enum nl80211_band band = cbss->channel->band; struct ieee80211_supported_band *sband; const struct element *elem; int err; /* neither HT nor VHT elements used on 6 GHz */ if (band == NL80211_BAND_6GHZ) return 0; if (assoc_data->link[link_id].conn.mode < IEEE80211_CONN_MODE_HT) return 0; rcu_read_lock(); elem = ieee80211_bss_get_elem(cbss, WLAN_EID_HT_OPERATION); if (!elem || elem->datalen < sizeof(struct ieee80211_ht_operation)) { mlme_link_id_dbg(sdata, link_id, "no HT operation on BSS %pM\n", cbss->bssid); err = -EINVAL; goto out_rcu; } assoc_data->link[link_id].ap_ht_param = ((struct ieee80211_ht_operation *)(elem->data))->ht_param; rcu_read_unlock(); if (assoc_data->link[link_id].conn.mode < IEEE80211_CONN_MODE_VHT) return 0; /* some drivers want to support VHT on 2.4 GHz even */ sband = sdata->local->hw.wiphy->bands[band]; if (!sband->vht_cap.vht_supported) return 0; rcu_read_lock(); elem = ieee80211_bss_get_elem(cbss, WLAN_EID_VHT_CAPABILITY); /* but even then accept it not being present on the AP */ if (!elem && band == NL80211_BAND_2GHZ) { err = 0; goto out_rcu; } if (!elem || elem->datalen < sizeof(struct ieee80211_vht_cap)) { mlme_link_id_dbg(sdata, link_id, "no VHT capa on BSS %pM\n", cbss->bssid); err = -EINVAL; goto out_rcu; } memcpy(&assoc_data->link[link_id].ap_vht_cap, elem->data, sizeof(struct ieee80211_vht_cap)); rcu_read_unlock(); return 0; out_rcu: rcu_read_unlock(); return err; } static bool ieee80211_mgd_assoc_bss_has_mld_ext_capa_ops(struct cfg80211_assoc_request *req) { const struct cfg80211_bss_ies *ies; struct cfg80211_bss *bss; const struct element *ml; /* not an MLO connection if link_id < 0, so irrelevant */ if (req->link_id < 0) return false; bss = req->links[req->link_id].bss; guard(rcu)(); ies = rcu_dereference(bss->ies); for_each_element_extid(ml, WLAN_EID_EXT_EHT_MULTI_LINK, ies->data, ies->len) { const struct ieee80211_multi_link_elem *mle; if (!ieee80211_mle_type_ok(ml->data + 1, IEEE80211_ML_CONTROL_TYPE_BASIC, ml->datalen - 1)) continue; mle = (void *)(ml->data + 1); if (mle->control & cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP)) return true; } return false; } int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req) { unsigned int assoc_link_id = req->link_id < 0 ? 0 : req->link_id; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data; const struct element *ssid_elem; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; struct ieee80211_link_data *link; struct cfg80211_bss *cbss; bool override, uapsd_supported; bool match_auth; int i, err; size_t size = sizeof(*assoc_data) + req->ie_len; for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) size += req->links[i].elems_len; /* FIXME: no support for 4-addr MLO yet */ if (sdata->u.mgd.use_4addr && req->link_id >= 0) return -EOPNOTSUPP; assoc_data = kzalloc(size, GFP_KERNEL); if (!assoc_data) return -ENOMEM; cbss = req->link_id < 0 ? req->bss : req->links[req->link_id].bss; if (ieee80211_mgd_csa_in_process(sdata, cbss)) { sdata_info(sdata, "AP is in CSA process, reject assoc\n"); err = -EINVAL; goto err_free; } rcu_read_lock(); ssid_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_SSID); if (!ssid_elem || ssid_elem->datalen > sizeof(assoc_data->ssid)) { rcu_read_unlock(); err = -EINVAL; goto err_free; } memcpy(assoc_data->ssid, ssid_elem->data, ssid_elem->datalen); assoc_data->ssid_len = ssid_elem->datalen; rcu_read_unlock(); if (req->ap_mld_addr) memcpy(assoc_data->ap_addr, req->ap_mld_addr, ETH_ALEN); else memcpy(assoc_data->ap_addr, cbss->bssid, ETH_ALEN); /* * Many APs have broken parsing of the extended MLD capa/ops field, * dropping (re-)association request frames or replying with association * response with a failure status if it's present. * Set our value from the userspace request only in strict mode or if * the AP also had that field present. */ if (ieee80211_hw_check(&local->hw, STRICT) || ieee80211_mgd_assoc_bss_has_mld_ext_capa_ops(req)) assoc_data->ext_mld_capa_ops = cpu_to_le16(req->ext_mld_capa_ops); if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; sdata_info(sdata, "disconnect from AP %pM for new assoc to %pM\n", sdata->vif.cfg.ap_addr, assoc_data->ap_addr); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_UNSPECIFIED, false, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, WLAN_REASON_UNSPECIFIED, false); } memset(sdata->u.mgd.userspace_selectors, 0, sizeof(sdata->u.mgd.userspace_selectors)); ieee80211_parse_cfg_selectors(sdata->u.mgd.userspace_selectors, req->supported_selectors, req->supported_selectors_len); memcpy(&ifmgd->ht_capa, &req->ht_capa, sizeof(ifmgd->ht_capa)); memcpy(&ifmgd->ht_capa_mask, &req->ht_capa_mask, sizeof(ifmgd->ht_capa_mask)); memcpy(&ifmgd->vht_capa, &req->vht_capa, sizeof(ifmgd->vht_capa)); memcpy(&ifmgd->vht_capa_mask, &req->vht_capa_mask, sizeof(ifmgd->vht_capa_mask)); memcpy(&ifmgd->s1g_capa, &req->s1g_capa, sizeof(ifmgd->s1g_capa)); memcpy(&ifmgd->s1g_capa_mask, &req->s1g_capa_mask, sizeof(ifmgd->s1g_capa_mask)); /* keep some setup (AP STA, channel, ...) if matching */ match_auth = ifmgd->auth_data && ether_addr_equal(ifmgd->auth_data->ap_addr, assoc_data->ap_addr) && ifmgd->auth_data->link_id == req->link_id; if (req->ap_mld_addr) { uapsd_supported = true; if (req->flags & (ASSOC_REQ_DISABLE_HT | ASSOC_REQ_DISABLE_VHT | ASSOC_REQ_DISABLE_HE | ASSOC_REQ_DISABLE_EHT)) { err = -EINVAL; goto err_free; } for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { struct ieee80211_supported_band *sband; struct cfg80211_bss *link_cbss = req->links[i].bss; struct ieee80211_bss *bss; if (!link_cbss) continue; bss = (void *)link_cbss->priv; if (!bss->wmm_used) { err = -EINVAL; req->links[i].error = err; goto err_free; } if (link_cbss->channel->band == NL80211_BAND_S1GHZ) { err = -EINVAL; req->links[i].error = err; goto err_free; } link = sdata_dereference(sdata->link[i], sdata); if (link) ether_addr_copy(assoc_data->link[i].addr, link->conf->addr); else eth_random_addr(assoc_data->link[i].addr); sband = local->hw.wiphy->bands[link_cbss->channel->band]; if (match_auth && i == assoc_link_id && link) assoc_data->link[i].conn = link->u.mgd.conn; else assoc_data->link[i].conn = ieee80211_conn_settings_unlimited; ieee80211_determine_our_sta_mode_assoc(sdata, sband, req, true, i, &assoc_data->link[i].conn); assoc_data->link[i].bss = link_cbss; assoc_data->link[i].disabled = req->links[i].disabled; if (!bss->uapsd_supported) uapsd_supported = false; if (assoc_data->link[i].conn.mode < IEEE80211_CONN_MODE_EHT) { err = -EINVAL; req->links[i].error = err; goto err_free; } err = ieee80211_mgd_get_ap_ht_vht_capa(sdata, assoc_data, i); if (err) { err = -EINVAL; req->links[i].error = err; goto err_free; } } assoc_data->wmm = true; } else { struct ieee80211_supported_band *sband; struct ieee80211_bss *bss = (void *)cbss->priv; memcpy(assoc_data->link[0].addr, sdata->vif.addr, ETH_ALEN); assoc_data->s1g = cbss->channel->band == NL80211_BAND_S1GHZ; assoc_data->wmm = bss->wmm_used && (local->hw.queues >= IEEE80211_NUM_ACS); if (cbss->channel->band == NL80211_BAND_6GHZ && req->flags & (ASSOC_REQ_DISABLE_HT | ASSOC_REQ_DISABLE_VHT | ASSOC_REQ_DISABLE_HE)) { err = -EINVAL; goto err_free; } sband = local->hw.wiphy->bands[cbss->channel->band]; assoc_data->link[0].bss = cbss; if (match_auth) assoc_data->link[0].conn = sdata->deflink.u.mgd.conn; else assoc_data->link[0].conn = ieee80211_conn_settings_unlimited; ieee80211_determine_our_sta_mode_assoc(sdata, sband, req, assoc_data->wmm, 0, &assoc_data->link[0].conn); uapsd_supported = bss->uapsd_supported; err = ieee80211_mgd_get_ap_ht_vht_capa(sdata, assoc_data, 0); if (err) goto err_free; } assoc_data->spp_amsdu = req->flags & ASSOC_REQ_SPP_AMSDU; if (ifmgd->auth_data && !ifmgd->auth_data->done) { err = -EBUSY; goto err_free; } if (ifmgd->assoc_data) { err = -EBUSY; goto err_free; } /* Cleanup is delayed if auth_data matches */ if (ifmgd->auth_data && !match_auth) ieee80211_destroy_auth_data(sdata, false); if (req->ie && req->ie_len) { memcpy(assoc_data->ie, req->ie, req->ie_len); assoc_data->ie_len = req->ie_len; assoc_data->ie_pos = assoc_data->ie + assoc_data->ie_len; } else { assoc_data->ie_pos = assoc_data->ie; } if (req->fils_kek) { /* should already be checked in cfg80211 - so warn */ if (WARN_ON(req->fils_kek_len > FILS_MAX_KEK_LEN)) { err = -EINVAL; goto err_free; } memcpy(assoc_data->fils_kek, req->fils_kek, req->fils_kek_len); assoc_data->fils_kek_len = req->fils_kek_len; } if (req->fils_nonces) memcpy(assoc_data->fils_nonces, req->fils_nonces, 2 * FILS_NONCE_LEN); /* default timeout */ assoc_data->timeout = jiffies; assoc_data->timeout_started = true; assoc_data->assoc_link_id = assoc_link_id; if (req->ap_mld_addr) { /* if there was no authentication, set up the link */ err = ieee80211_vif_set_links(sdata, BIT(assoc_link_id), 0); if (err) goto err_clear; } link = sdata_dereference(sdata->link[assoc_link_id], sdata); if (WARN_ON(!link)) { err = -EINVAL; goto err_clear; } override = link->u.mgd.conn.mode != assoc_data->link[assoc_link_id].conn.mode || link->u.mgd.conn.bw_limit != assoc_data->link[assoc_link_id].conn.bw_limit; link->u.mgd.conn = assoc_data->link[assoc_link_id].conn; ieee80211_setup_assoc_link(sdata, assoc_data, req, &link->u.mgd.conn, assoc_link_id); if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) && ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK), "U-APSD not supported with HW_PS_NULLFUNC_STACK\n")) sdata->vif.driver_flags &= ~IEEE80211_VIF_SUPPORTS_UAPSD; if (assoc_data->wmm && uapsd_supported && (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD)) { assoc_data->uapsd = true; ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED; } else { assoc_data->uapsd = false; ifmgd->flags &= ~IEEE80211_STA_UAPSD_ENABLED; } if (req->prev_bssid) memcpy(assoc_data->prev_ap_addr, req->prev_bssid, ETH_ALEN); if (req->use_mfp) { ifmgd->mfp = IEEE80211_MFP_REQUIRED; ifmgd->flags |= IEEE80211_STA_MFP_ENABLED; } else { ifmgd->mfp = IEEE80211_MFP_DISABLED; ifmgd->flags &= ~IEEE80211_STA_MFP_ENABLED; } if (req->flags & ASSOC_REQ_USE_RRM) ifmgd->flags |= IEEE80211_STA_ENABLE_RRM; else ifmgd->flags &= ~IEEE80211_STA_ENABLE_RRM; if (req->crypto.control_port) ifmgd->flags |= IEEE80211_STA_CONTROL_PORT; else ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT; sdata->control_port_protocol = req->crypto.control_port_ethertype; sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt; sdata->control_port_over_nl80211 = req->crypto.control_port_over_nl80211; sdata->control_port_no_preauth = req->crypto.control_port_no_preauth; /* kick off associate process */ ifmgd->assoc_data = assoc_data; for (i = 0; i < ARRAY_SIZE(assoc_data->link); i++) { if (!assoc_data->link[i].bss) continue; if (i == assoc_data->assoc_link_id) continue; /* only calculate the mode, hence link == NULL */ err = ieee80211_prep_channel(sdata, NULL, i, assoc_data->link[i].bss, true, &assoc_data->link[i].conn, sdata->u.mgd.userspace_selectors); if (err) { req->links[i].error = err; goto err_clear; } } memcpy(vif_cfg->ssid, assoc_data->ssid, assoc_data->ssid_len); vif_cfg->ssid_len = assoc_data->ssid_len; /* needed for transmitting the assoc frames properly */ memcpy(sdata->vif.cfg.ap_addr, assoc_data->ap_addr, ETH_ALEN); err = ieee80211_prep_connection(sdata, cbss, req->link_id, req->ap_mld_addr, true, &assoc_data->link[assoc_link_id].conn, override, sdata->u.mgd.userspace_selectors); if (err) goto err_clear; if (ieee80211_hw_check(&sdata->local->hw, NEED_DTIM_BEFORE_ASSOC)) { const struct cfg80211_bss_ies *beacon_ies; rcu_read_lock(); beacon_ies = rcu_dereference(req->bss->beacon_ies); if (!beacon_ies) { /* * Wait up to one beacon interval ... * should this be more if we miss one? */ sdata_info(sdata, "waiting for beacon from %pM\n", link->u.mgd.bssid); assoc_data->timeout = TU_TO_EXP_TIME(req->bss->beacon_interval); assoc_data->timeout_started = true; assoc_data->need_beacon = true; } rcu_read_unlock(); } run_again(sdata, assoc_data->timeout); /* We are associating, clean up auth_data */ if (ifmgd->auth_data) ieee80211_destroy_auth_data(sdata, true); return 0; err_clear: if (!ifmgd->auth_data) { eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); } ifmgd->assoc_data = NULL; err_free: kfree(assoc_data); return err; } int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, struct cfg80211_deauth_request *req) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; bool tx = !req->local_state_change; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_DEAUTH, }; if (ifmgd->auth_data && ether_addr_equal(ifmgd->auth_data->ap_addr, req->bssid)) { sdata_info(sdata, "aborting authentication with %pM by local choice (Reason: %u=%s)\n", req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); info.link_id = ifmgd->auth_data->link_id; drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); ieee80211_destroy_auth_data(sdata, false); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } if (ifmgd->assoc_data && ether_addr_equal(ifmgd->assoc_data->ap_addr, req->bssid)) { sdata_info(sdata, "aborting association with %pM by local choice (Reason: %u=%s)\n", req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); info.link_id = ifmgd->assoc_data->assoc_link_id; drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); ieee80211_destroy_assoc_data(sdata, ASSOC_ABANDON); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } if (ifmgd->associated && ether_addr_equal(sdata->vif.cfg.ap_addr, req->bssid)) { sdata_info(sdata, "deauthenticating from %pM by local choice (Reason: %u=%s)\n", req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); return 0; } return -ENOTCONN; } int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_disassoc_request *req) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; if (!sdata->u.mgd.associated || memcmp(sdata->vif.cfg.ap_addr, req->ap_addr, ETH_ALEN)) return -ENOTCONN; sdata_info(sdata, "disassociating from %pM by local choice (Reason: %u=%s)\n", req->ap_addr, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DISASSOC, req->reason_code, !req->local_state_change, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); return 0; } void ieee80211_mgd_stop_link(struct ieee80211_link_data *link) { wiphy_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.request_smps_work); wiphy_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); wiphy_hrtimer_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work); } void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; /* * Make sure some work items will not run after this, * they will not do anything but might not have been * cancelled when disconnecting. */ wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->monitor_work); wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->beacon_connection_loss_work); wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->tdls_peer_del_work); if (ifmgd->assoc_data) ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT); if (ifmgd->auth_data) ieee80211_destroy_auth_data(sdata, false); spin_lock_bh(&ifmgd->teardown_lock); if (ifmgd->teardown_skb) { kfree_skb(ifmgd->teardown_skb); ifmgd->teardown_skb = NULL; ifmgd->orig_teardown_skb = NULL; } kfree(ifmgd->assoc_req_ies); ifmgd->assoc_req_ies = NULL; ifmgd->assoc_req_ies_len = 0; spin_unlock_bh(&ifmgd->teardown_lock); timer_delete_sync(&ifmgd->timer); } void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, enum nl80211_cqm_rssi_threshold_event rssi_event, s32 rssi_level, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); trace_api_cqm_rssi_notify(sdata, rssi_event, rssi_level); cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, rssi_level, gfp); } EXPORT_SYMBOL(ieee80211_cqm_rssi_notify); void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); trace_api_cqm_beacon_loss_notify(sdata->local, sdata); cfg80211_cqm_beacon_loss_notify(sdata->dev, gfp); } EXPORT_SYMBOL(ieee80211_cqm_beacon_loss_notify); static void _ieee80211_enable_rssi_reports(struct ieee80211_sub_if_data *sdata, int rssi_min_thold, int rssi_max_thold) { trace_api_enable_rssi_reports(sdata, rssi_min_thold, rssi_max_thold); if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; /* * Scale up threshold values before storing it, as the RSSI averaging * algorithm uses a scaled up value as well. Change this scaling * factor if the RSSI averaging algorithm changes. */ sdata->u.mgd.rssi_min_thold = rssi_min_thold*16; sdata->u.mgd.rssi_max_thold = rssi_max_thold*16; } void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif, int rssi_min_thold, int rssi_max_thold) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); WARN_ON(rssi_min_thold == rssi_max_thold || rssi_min_thold > rssi_max_thold); _ieee80211_enable_rssi_reports(sdata, rssi_min_thold, rssi_max_thold); } EXPORT_SYMBOL(ieee80211_enable_rssi_reports); void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); _ieee80211_enable_rssi_reports(sdata, 0, 0); } EXPORT_SYMBOL(ieee80211_disable_rssi_reports); void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *add_links_data = ifmgd->reconf.add_links_data; struct sta_info *sta; struct cfg80211_mlo_reconf_done_data done_data = {}; u16 sta_changed_links = sdata->u.mgd.reconf.added_links | sdata->u.mgd.reconf.removed_links; u16 link_mask, valid_links; unsigned int link_id; size_t orig_len = len; u8 i, group_key_data_len; u8 *pos; if (!ieee80211_vif_is_mld(&sdata->vif) || len < offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp) || mgmt->u.action.u.ml_reconf_resp.dialog_token != sdata->u.mgd.reconf.dialog_token || !sta_changed_links) return; pos = mgmt->u.action.u.ml_reconf_resp.variable; len -= offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp); /* each status duple is 3 octets */ if (len < mgmt->u.action.u.ml_reconf_resp.count * 3) { sdata_info(sdata, "mlo: reconf: unexpected len=%zu, count=%u\n", len, mgmt->u.action.u.ml_reconf_resp.count); goto disconnect; } link_mask = sta_changed_links; for (i = 0; i < mgmt->u.action.u.ml_reconf_resp.count; i++) { u16 status = get_unaligned_le16(pos + 1); link_id = *pos; if (!(link_mask & BIT(link_id))) { sdata_info(sdata, "mlo: reconf: unexpected link: %u, changed=0x%x\n", link_id, sta_changed_links); goto disconnect; } /* clear the corresponding link, to detect the case that * the same link was included more than one time */ link_mask &= ~BIT(link_id); /* Handle failure to remove links here. Failure to remove added * links will be done later in the flow. */ if (status != WLAN_STATUS_SUCCESS) { sdata_info(sdata, "mlo: reconf: failed on link=%u, status=%u\n", link_id, status); /* The AP MLD failed to remove a link that was already * removed locally. As this is not expected behavior, * disconnect */ if (sdata->u.mgd.reconf.removed_links & BIT(link_id)) goto disconnect; /* The AP MLD failed to add a link. Remove it from the * added links. */ sdata->u.mgd.reconf.added_links &= ~BIT(link_id); } pos += 3; len -= 3; } if (link_mask) { sdata_info(sdata, "mlo: reconf: no response for links=0x%x\n", link_mask); goto disconnect; } if (!sdata->u.mgd.reconf.added_links) goto out; if (len < 1 || len < 1 + *pos) { sdata_info(sdata, "mlo: reconf: invalid group key data length"); goto disconnect; } /* The Group Key Data field must be present when links are added. This * field should be processed by userland. */ group_key_data_len = *pos++; pos += group_key_data_len; len -= group_key_data_len + 1; /* Process the information for the added links */ sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (WARN_ON(!sta)) goto disconnect; valid_links = sdata->vif.valid_links; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!add_links_data->link[link_id].bss || !(sdata->u.mgd.reconf.added_links & BIT(link_id))) continue; valid_links |= BIT(link_id); if (ieee80211_sta_allocate_link(sta, link_id)) goto disconnect; } ieee80211_vif_set_links(sdata, valid_links, sdata->vif.dormant_links); link_mask = 0; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = add_links_data->link[link_id].bss; struct ieee80211_link_data *link; struct link_sta_info *link_sta; u64 changed = 0; if (!cbss) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) goto disconnect; link_info(link, "mlo: reconf: local address %pM, AP link address %pM\n", add_links_data->link[link_id].addr, add_links_data->link[link_id].bss->bssid); link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); if (WARN_ON(!link_sta)) goto disconnect; if (!link->u.mgd.have_beacon) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); ies = rcu_dereference(cbss->beacon_ies); if (ies) link->u.mgd.have_beacon = true; else ies = rcu_dereference(cbss->ies); ieee80211_get_dtim(ies, &link->conf->sync_dtim_count, &link->u.mgd.dtim_period); link->conf->beacon_int = cbss->beacon_interval; rcu_read_unlock(); } link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; link->u.mgd.conn = add_links_data->link[link_id].conn; if (ieee80211_prep_channel(sdata, link, link_id, cbss, true, &link->u.mgd.conn, sdata->u.mgd.userspace_selectors)) { link_info(link, "mlo: reconf: prep_channel failed\n"); goto disconnect; } if (ieee80211_mgd_setup_link_sta(link, sta, link_sta, add_links_data->link[link_id].bss)) goto disconnect; if (!ieee80211_assoc_config_link(link, link_sta, add_links_data->link[link_id].bss, mgmt, pos, len, &changed)) goto disconnect; /* The AP MLD indicated success for this link, but the station * profile status indicated otherwise. Since there is an * inconsistency in the ML reconfiguration response, disconnect */ if (add_links_data->link[link_id].status != WLAN_STATUS_SUCCESS) goto disconnect; ieee80211_sta_init_nss(link_sta); if (ieee80211_sta_activate_link(sta, link_id)) goto disconnect; changed |= ieee80211_link_set_associated(link, cbss); ieee80211_link_info_change_notify(sdata, link, changed); ieee80211_recalc_smps(sdata, link); link_mask |= BIT(link_id); } sdata_info(sdata, "mlo: reconf: current valid_links=0x%x, added=0x%x\n", valid_links, link_mask); /* links might have changed due to rejected ones, set them again */ ieee80211_vif_set_links(sdata, valid_links, sdata->vif.dormant_links); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); done_data.buf = (const u8 *)mgmt; done_data.len = orig_len; done_data.added_links = link_mask; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { done_data.links[link_id].bss = add_links_data->link[link_id].bss; done_data.links[link_id].addr = add_links_data->link[link_id].addr; } cfg80211_mlo_reconf_add_done(sdata->dev, &done_data); kfree(sdata->u.mgd.reconf.add_links_data); sdata->u.mgd.reconf.add_links_data = NULL; out: ieee80211_ml_reconf_reset(sdata); return; disconnect: __ieee80211_disconnect(sdata); } static struct sk_buff * ieee80211_build_ml_reconf_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *add_links_data, u16 removed_links, __le16 ext_mld_capa_ops) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct ieee80211_multi_link_elem *ml_elem; struct ieee80211_mle_basic_common_info *common; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); struct sk_buff *skb; size_t size; unsigned int link_id; __le16 eml_capa = 0, mld_capa_ops = 0; struct ieee80211_tx_info *info; u8 common_size, var_common_size; u8 *ml_elem_len; u16 capab = 0; size = local->hw.extra_tx_headroom + sizeof(*mgmt); /* Consider the maximal length of the reconfiguration ML element */ size += sizeof(struct ieee80211_multi_link_elem); /* The Basic ML element and the Reconfiguration ML element have the same * fixed common information fields in the context of ML reconfiguration * action frame. The AP MLD MAC address must always be present */ common_size = sizeof(*common); /* when adding links, the MLD capabilities must be present */ var_common_size = 0; if (add_links_data) { const struct wiphy_iftype_ext_capab *ift_ext_capa = cfg80211_get_iftype_ext_capa(local->hw.wiphy, ieee80211_vif_type_p2p(&sdata->vif)); if (ift_ext_capa) { eml_capa = cpu_to_le16(ift_ext_capa->eml_capabilities); mld_capa_ops = cpu_to_le16(ift_ext_capa->mld_capa_and_ops); } /* MLD capabilities and operation */ var_common_size += 2; /* EML capabilities */ if (eml_capa & cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP | IEEE80211_EML_CAP_EMLMR_SUPPORT))) var_common_size += 2; } if (ext_mld_capa_ops) var_common_size += 2; /* Add the common information length */ size += common_size + var_common_size; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss; size_t elems_len; if (removed_links & BIT(link_id)) { size += sizeof(struct ieee80211_mle_per_sta_profile) + ETH_ALEN; continue; } if (!add_links_data || !add_links_data->link[link_id].bss) continue; elems_len = add_links_data->link[link_id].elems_len; cbss = add_links_data->link[link_id].bss; /* should be the same across all BSSes */ if (cbss->capability & WLAN_CAPABILITY_PRIVACY) capab |= WLAN_CAPABILITY_PRIVACY; size += 2 + sizeof(struct ieee80211_mle_per_sta_profile) + ETH_ALEN; /* WMM */ size += 9; size += ieee80211_link_common_elems_size(sdata, iftype, cbss, elems_len); } skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = skb_put_zero(skb, offsetofend(struct ieee80211_mgmt, u.action.u.ml_reconf_req)); /* Add the MAC header */ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); /* Add the action frame fixed fields */ mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ml_reconf_req.action_code = WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ; /* allocate a dialog token and store it */ sdata->u.mgd.reconf.dialog_token = ++sdata->u.mgd.dialog_token_alloc; mgmt->u.action.u.ml_reconf_req.dialog_token = sdata->u.mgd.reconf.dialog_token; /* Add the ML reconfiguration element and the common information */ skb_put_u8(skb, WLAN_EID_EXTENSION); ml_elem_len = skb_put(skb, 1); skb_put_u8(skb, WLAN_EID_EXT_EHT_MULTI_LINK); ml_elem = skb_put(skb, sizeof(*ml_elem)); ml_elem->control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_RECONF | IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR); common = skb_put(skb, common_size); common->len = common_size + var_common_size; memcpy(common->mld_mac_addr, sdata->vif.addr, ETH_ALEN); if (add_links_data) { if (eml_capa & cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP | IEEE80211_EML_CAP_EMLMR_SUPPORT))) { ml_elem->control |= cpu_to_le16(IEEE80211_MLC_RECONF_PRES_EML_CAPA); skb_put_data(skb, &eml_capa, sizeof(eml_capa)); } ml_elem->control |= cpu_to_le16(IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP); skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops)); } if (ext_mld_capa_ops) { ml_elem->control |= cpu_to_le16(IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP); skb_put_data(skb, &ext_mld_capa_ops, sizeof(ext_mld_capa_ops)); } if (sdata->u.mgd.flags & IEEE80211_STA_ENABLE_RRM) capab |= WLAN_CAPABILITY_RADIO_MEASURE; /* Add the per station profile */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { u8 *subelem_len = NULL; u16 ctrl; const u8 *addr; /* Skip links that are not changing */ if (!(removed_links & BIT(link_id)) && (!add_links_data || !add_links_data->link[link_id].bss)) continue; ctrl = link_id | IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT; if (removed_links & BIT(link_id)) { struct ieee80211_bss_conf *conf = sdata_dereference(sdata->vif.link_conf[link_id], sdata); if (!conf) continue; addr = conf->addr; ctrl |= u16_encode_bits(IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK, IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE); } else { addr = add_links_data->link[link_id].addr; ctrl |= IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE | u16_encode_bits(IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK, IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE); } skb_put_u8(skb, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE); subelem_len = skb_put(skb, 1); put_unaligned_le16(ctrl, skb_put(skb, sizeof(ctrl))); skb_put_u8(skb, 1 + ETH_ALEN); skb_put_data(skb, addr, ETH_ALEN); if (!(removed_links & BIT(link_id))) { u16 link_present_elems[PRESENT_ELEMS_MAX] = {}; size_t extra_used; void *capab_pos; u8 qos_info; capab_pos = skb_put(skb, 2); extra_used = ieee80211_add_link_elems(sdata, skb, &capab, NULL, add_links_data->link[link_id].elems, add_links_data->link[link_id].elems_len, link_id, NULL, link_present_elems, add_links_data); if (add_links_data->link[link_id].elems) skb_put_data(skb, add_links_data->link[link_id].elems + extra_used, add_links_data->link[link_id].elems_len - extra_used); if (sdata->u.mgd.flags & IEEE80211_STA_UAPSD_ENABLED) { qos_info = sdata->u.mgd.uapsd_queues; qos_info |= (sdata->u.mgd.uapsd_max_sp_len << IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT); } else { qos_info = 0; } ieee80211_add_wmm_info_ie(skb_put(skb, 9), qos_info); put_unaligned_le16(capab, capab_pos); } ieee80211_fragment_element(skb, subelem_len, IEEE80211_MLE_SUBELEM_FRAGMENT); } ieee80211_fragment_element(skb, ml_elem_len, WLAN_EID_FRAGMENT); info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; return skb; } int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, struct cfg80211_ml_reconf_req *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgd_assoc_data *data = NULL; struct sta_info *sta; struct sk_buff *skb; u16 added_links, new_valid_links; int link_id, err; if (!ieee80211_vif_is_mld(&sdata->vif) || !(sdata->vif.cfg.mld_capa_op & IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT)) return -EINVAL; /* No support for concurrent ML reconfiguration operation */ if (sdata->u.mgd.reconf.added_links || sdata->u.mgd.reconf.removed_links) return -EBUSY; added_links = 0; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!req->add_links[link_id].bss) continue; added_links |= BIT(link_id); } sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (WARN_ON(!sta)) return -ENOLINK; /* Adding links to the set of valid link is done only after a successful * ML reconfiguration frame exchange. Here prepare the data for the ML * reconfiguration frame construction and allocate the required * resources */ if (added_links) { bool uapsd_supported; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->assoc_link_id = -1; data->wmm = true; uapsd_supported = true; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_supported_band *sband; struct cfg80211_bss *link_cbss = req->add_links[link_id].bss; struct ieee80211_bss *bss; if (!link_cbss) continue; bss = (void *)link_cbss->priv; if (!bss->wmm_used) { err = -EINVAL; goto err_free; } if (link_cbss->channel->band == NL80211_BAND_S1GHZ) { err = -EINVAL; goto err_free; } eth_random_addr(data->link[link_id].addr); data->link[link_id].conn = ieee80211_conn_settings_unlimited; sband = local->hw.wiphy->bands[link_cbss->channel->band]; ieee80211_determine_our_sta_mode(sdata, sband, NULL, true, link_id, &data->link[link_id].conn); data->link[link_id].bss = link_cbss; data->link[link_id].disabled = req->add_links[link_id].disabled; data->link[link_id].elems = (u8 *)req->add_links[link_id].elems; data->link[link_id].elems_len = req->add_links[link_id].elems_len; if (!bss->uapsd_supported) uapsd_supported = false; if (data->link[link_id].conn.mode < IEEE80211_CONN_MODE_EHT) { err = -EINVAL; goto err_free; } err = ieee80211_mgd_get_ap_ht_vht_capa(sdata, data, link_id); if (err) { err = -EINVAL; goto err_free; } } /* Require U-APSD support if we enabled it */ if (sdata->u.mgd.flags & IEEE80211_STA_UAPSD_ENABLED && !uapsd_supported) { err = -EINVAL; sdata_info(sdata, "U-APSD on but not available on (all) new links\n"); goto err_free; } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!data->link[link_id].bss) continue; /* only used to verify the mode, nothing is allocated */ err = ieee80211_prep_channel(sdata, NULL, link_id, data->link[link_id].bss, true, &data->link[link_id].conn, sdata->u.mgd.userspace_selectors); if (err) goto err_free; } } /* link removal is done before the ML reconfiguration frame exchange so * that these links will not be used between their removal by the AP MLD * and before the station got the ML reconfiguration response. Based on * Section 35.3.6.4 in Draft P802.11be_D7.0 the AP MLD should accept the * link removal request. */ if (req->rem_links) { u16 new_active_links = sdata->vif.active_links & ~req->rem_links; new_valid_links = sdata->vif.valid_links & ~req->rem_links; /* Should not be left with no valid links to perform the * ML reconfiguration */ if (!new_valid_links || !(new_valid_links & ~sdata->vif.dormant_links)) { sdata_info(sdata, "mlo: reconf: no valid links\n"); err = -EINVAL; goto err_free; } if (new_active_links != sdata->vif.active_links) { if (!new_active_links) new_active_links = BIT(__ffs(new_valid_links & ~sdata->vif.dormant_links)); err = ieee80211_set_active_links(&sdata->vif, new_active_links); if (err) { sdata_info(sdata, "mlo: reconf: failed set active links\n"); goto err_free; } } } /* Build the SKB before the link removal as the construction of the * station info for removed links requires the local address. * Invalidate the removed links, so that the transmission of the ML * reconfiguration request frame would not be done using them, as the AP * is expected to send the ML reconfiguration response frame on the link * on which the request was received. */ skb = ieee80211_build_ml_reconf_req(sdata, data, req->rem_links, cpu_to_le16(req->ext_mld_capa_ops)); if (!skb) { err = -ENOMEM; goto err_free; } if (req->rem_links) { u16 new_dormant_links = sdata->vif.dormant_links & ~req->rem_links; err = ieee80211_vif_set_links(sdata, new_valid_links, new_dormant_links); if (err) { sdata_info(sdata, "mlo: reconf: failed set valid links\n"); kfree_skb(skb); goto err_free; } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!(req->rem_links & BIT(link_id))) continue; ieee80211_sta_remove_link(sta, link_id); } /* notify the driver and upper layers */ ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); cfg80211_links_removed(sdata->dev, req->rem_links); } sdata_info(sdata, "mlo: reconf: adding=0x%x, removed=0x%x\n", added_links, req->rem_links); ieee80211_tx_skb(sdata, skb); sdata->u.mgd.reconf.added_links = added_links; sdata->u.mgd.reconf.add_links_data = data; sdata->u.mgd.reconf.removed_links = req->rem_links; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.reconf.wk, IEEE80211_ASSOC_TIMEOUT_SHORT); return 0; err_free: kfree(data); return err; } static bool ieee80211_mgd_epcs_supp(struct ieee80211_sub_if_data *sdata) { unsigned long valid_links = sdata->vif.valid_links; u8 link_id; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!ieee80211_vif_is_mld(&sdata->vif)) return false; for_each_set_bit(link_id, &valid_links, IEEE80211_MLD_MAX_NUM_LINKS) { struct ieee80211_bss_conf *bss_conf = sdata_dereference(sdata->vif.link_conf[link_id], sdata); if (WARN_ON(!bss_conf) || !bss_conf->epcs_support) return false; } return true; } int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int frame_len = offsetofend(struct ieee80211_mgmt, u.action.u.epcs) + (enable ? 1 : 0); if (!ieee80211_mgd_epcs_supp(sdata)) return -EINVAL; if (sdata->u.mgd.epcs.enabled == enable && !sdata->u.mgd.epcs.dialog_token) return 0; /* Do not allow enabling EPCS if the AP didn't respond yet. * However, allow disabling EPCS in such a case. */ if (sdata->u.mgd.epcs.dialog_token && enable) return -EALREADY; skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); if (!skb) return -ENOBUFS; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = skb_put_zero(skb, frame_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; if (enable) { u8 *pos = mgmt->u.action.u.epcs.variable; mgmt->u.action.u.epcs.action_code = WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ; *pos = ++sdata->u.mgd.dialog_token_alloc; sdata->u.mgd.epcs.dialog_token = *pos; } else { mgmt->u.action.u.epcs.action_code = WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN; ieee80211_epcs_teardown(sdata); ieee80211_epcs_changed(sdata, false); } ieee80211_tx_skb(sdata, skb); return 0; } static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems) { const struct element *sub; size_t scratch_len = elems->ml_epcs_len; u8 *scratch __free(kfree) = kzalloc(scratch_len, GFP_KERNEL); lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!ieee80211_vif_is_mld(&sdata->vif) || !elems->ml_epcs) return; if (WARN_ON(!scratch)) return; /* Directly parse the sub elements as the common information doesn't * hold any useful information. */ for_each_mle_subelement(sub, (const u8 *)elems->ml_epcs, elems->ml_epcs_len) { struct ieee802_11_elems *link_elems __free(kfree) = NULL; struct ieee80211_link_data *link; u8 *pos = (void *)sub->data; u16 control; ssize_t len; u8 link_id; if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) continue; if (sub->datalen < sizeof(control)) break; control = get_unaligned_le16(pos); link_id = control & IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; len = cfg80211_defragment_element(sub, (u8 *)elems->ml_epcs, elems->ml_epcs_len, scratch, scratch_len, IEEE80211_MLE_SUBELEM_FRAGMENT); if (len < (ssize_t)sizeof(control)) continue; pos = scratch + sizeof(control); len -= sizeof(control); link_elems = ieee802_11_parse_elems(pos, len, false, NULL); if (!link_elems) continue; if (ieee80211_sta_wmm_params(sdata->local, link, link_elems->wmm_param, link_elems->wmm_param_len, link_elems->mu_edca_param_set)) ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); } } void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee802_11_elems *elems __free(kfree) = NULL; size_t ies_len; u16 status_code; u8 *pos, dialog_token; if (!ieee80211_mgd_epcs_supp(sdata)) return; /* Handle dialog token and status code */ pos = mgmt->u.action.u.epcs.variable; dialog_token = *pos; status_code = get_unaligned_le16(pos + 1); /* An EPCS enable response with dialog token == 0 is an unsolicited * notification from the AP MLD. In such a case, EPCS should already be * enabled and status must be success */ if (!dialog_token && (!sdata->u.mgd.epcs.enabled || status_code != WLAN_STATUS_SUCCESS)) return; if (sdata->u.mgd.epcs.dialog_token != dialog_token) return; sdata->u.mgd.epcs.dialog_token = 0; if (status_code != WLAN_STATUS_SUCCESS) return; pos += IEEE80211_EPCS_ENA_RESP_BODY_LEN; ies_len = len - offsetof(struct ieee80211_mgmt, u.action.u.epcs.variable) - IEEE80211_EPCS_ENA_RESP_BODY_LEN; elems = ieee802_11_parse_elems(pos, ies_len, true, NULL); if (!elems) return; ieee80211_ml_epcs(sdata, elems); ieee80211_epcs_changed(sdata, true); } void ieee80211_process_epcs_teardown(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { if (!ieee80211_vif_is_mld(&sdata->vif) || !sdata->u.mgd.epcs.enabled) return; ieee80211_epcs_teardown(sdata); ieee80211_epcs_changed(sdata, false); }
4 4 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 // SPDX-License-Identifier: GPL-2.0-only /* * Netlink interface for IEEE 802.15.4 stack * * Copyright 2007, 2008 Siemens AG * * Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Maxim Osipov <maxim.osipov@siemens.com> */ #include <linux/kernel.h> #include <linux/gfp.h> #include <net/genetlink.h> #include <linux/nl802154.h> #include "ieee802154.h" static unsigned int ieee802154_seq_num; static DEFINE_SPINLOCK(ieee802154_seq_lock); /* Requests to userspace */ struct sk_buff *ieee802154_nl_create(int flags, u8 req) { void *hdr; struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); unsigned long f; if (!msg) return NULL; spin_lock_irqsave(&ieee802154_seq_lock, f); hdr = genlmsg_put(msg, 0, ieee802154_seq_num++, &nl802154_family, flags, req); spin_unlock_irqrestore(&ieee802154_seq_lock, f); if (!hdr) { nlmsg_free(msg); return NULL; } return msg; } int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group) { struct nlmsghdr *nlh = nlmsg_hdr(msg); void *hdr = genlmsg_data(nlmsg_data(nlh)); genlmsg_end(msg, hdr); return genlmsg_multicast(&nl802154_family, msg, 0, group, GFP_ATOMIC); } struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, int flags, u8 req) { void *hdr; struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return NULL; hdr = genlmsg_put_reply(msg, info, &nl802154_family, flags, req); if (!hdr) { nlmsg_free(msg); return NULL; } return msg; } int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info) { struct nlmsghdr *nlh = nlmsg_hdr(msg); void *hdr = genlmsg_data(nlmsg_data(nlh)); genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); } static const struct genl_small_ops ieee802154_ops[] = { /* see nl-phy.c */ IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy, ieee802154_dump_phy), IEEE802154_OP(IEEE802154_ADD_IFACE, ieee802154_add_iface), IEEE802154_OP(IEEE802154_DEL_IFACE, ieee802154_del_iface), /* see nl-mac.c */ IEEE802154_OP(IEEE802154_ASSOCIATE_REQ, ieee802154_associate_req), IEEE802154_OP(IEEE802154_ASSOCIATE_RESP, ieee802154_associate_resp), IEEE802154_OP(IEEE802154_DISASSOCIATE_REQ, ieee802154_disassociate_req), IEEE802154_OP(IEEE802154_SCAN_REQ, ieee802154_scan_req), IEEE802154_OP(IEEE802154_START_REQ, ieee802154_start_req), IEEE802154_DUMP(IEEE802154_LIST_IFACE, ieee802154_list_iface, ieee802154_dump_iface), IEEE802154_OP(IEEE802154_SET_MACPARAMS, ieee802154_set_macparams), IEEE802154_OP(IEEE802154_LLSEC_GETPARAMS, ieee802154_llsec_getparams), IEEE802154_OP(IEEE802154_LLSEC_SETPARAMS, ieee802154_llsec_setparams), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_KEY, NULL, ieee802154_llsec_dump_keys), IEEE802154_OP(IEEE802154_LLSEC_ADD_KEY, ieee802154_llsec_add_key), IEEE802154_OP(IEEE802154_LLSEC_DEL_KEY, ieee802154_llsec_del_key), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEV, NULL, ieee802154_llsec_dump_devs), IEEE802154_OP(IEEE802154_LLSEC_ADD_DEV, ieee802154_llsec_add_dev), IEEE802154_OP(IEEE802154_LLSEC_DEL_DEV, ieee802154_llsec_del_dev), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEVKEY, NULL, ieee802154_llsec_dump_devkeys), IEEE802154_OP(IEEE802154_LLSEC_ADD_DEVKEY, ieee802154_llsec_add_devkey), IEEE802154_OP(IEEE802154_LLSEC_DEL_DEVKEY, ieee802154_llsec_del_devkey), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_SECLEVEL, NULL, ieee802154_llsec_dump_seclevels), IEEE802154_OP(IEEE802154_LLSEC_ADD_SECLEVEL, ieee802154_llsec_add_seclevel), IEEE802154_OP(IEEE802154_LLSEC_DEL_SECLEVEL, ieee802154_llsec_del_seclevel), }; static const struct genl_multicast_group ieee802154_mcgrps[] = { [IEEE802154_COORD_MCGRP] = { .name = IEEE802154_MCAST_COORD_NAME, }, [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, }, }; struct genl_family nl802154_family __ro_after_init = { .hdrsize = 0, .name = IEEE802154_NL_NAME, .version = 1, .maxattr = IEEE802154_ATTR_MAX, .policy = ieee802154_policy, .module = THIS_MODULE, .small_ops = ieee802154_ops, .n_small_ops = ARRAY_SIZE(ieee802154_ops), .resv_start_op = IEEE802154_LLSEC_DEL_SECLEVEL + 1, .mcgrps = ieee802154_mcgrps, .n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps), }; int __init ieee802154_nl_init(void) { return genl_register_family(&nl802154_family); } void ieee802154_nl_exit(void) { genl_unregister_family(&nl802154_family); }
90 14 1342 671 454 467 466 255 255 4202 202 248 492 72 137 137 137 37 37 37 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_MAPLE_TREE_H #define _LINUX_MAPLE_TREE_H /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com> * Matthew Wilcox <willy@infradead.org> */ #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> /* #define CONFIG_MAPLE_RCU_DISABLED */ /* * Allocated nodes are mutable until they have been inserted into the tree, * at which time they cannot change their type until they have been removed * from the tree and an RCU grace period has passed. * * Removed nodes have their ->parent set to point to themselves. RCU readers * check ->parent before relying on the value that they loaded from the * slots array. This lets us reuse the slots array for the RCU head. * * Nodes in the tree point to their parent unless bit 0 is set. */ #if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) /* 64bit sizes */ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ #define MAPLE_NODE_MASK 255UL /* * The node->parent of the root node has bit 0 set and the rest of the pointer * is a pointer to the tree itself. No more bits are available in this pointer * (on m68k, the data structure may only be 2-byte aligned). * * Internal non-root nodes can only have maple_range_* nodes as parents. The * parent pointer is 256B aligned like all other tree nodes. When storing a 32 * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an * extra bit to store the offset. This extra bit comes from a reuse of the last * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * * Once the type is decided, the decision of an allocation range type or a * range type is done by examining the immutable tree flag for the * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0b??1 = Root * 0b?00 = 16 bit nodes * 0b010 = 32 bit nodes * 0b110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location * 0b??1 : Root * 0b?00 : 16 bit values, type in 0-1, slot in 2-6 * 0b010 : 32 bit values, type in 0-2, slot in 3-6 * 0b110 : 64 bit values, type in 0-2, slot in 3-6 */ /* * This metadata is used to optimize the gap updating code and in reverse * searching for gaps or any other code that needs to find the end of the data. */ struct maple_metadata { unsigned char end; /* end of data */ unsigned char gap; /* offset of largest gap */ }; /* * Leaf nodes do not store pointers to nodes, they store user data. Users may * store almost any bit pattern. As noted above, the optimisation of storing an * entry at 0 in the root pointer cannot be done for data which have the bottom * two bits set to '10'. We also reserve values with the bottom two bits set to * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs * return errnos as a negative errno shifted right by two bits and the bottom * two bits set to '10', and while choosing to store these values in the array * is not an error, it may lead to confusion if you're testing for an error with * mas_is_err(). * * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges, Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. */ struct maple_range_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; union { void __rcu *slot[MAPLE_RANGE64_SLOTS]; struct { void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; struct maple_metadata meta; }; }; }; /* * At tree creation time, the user can specify that they're willing to trade off * storing fewer entries in a tree in return for storing more information in * each node. * * The maple tree supports recording the largest range of NULL entries available * in this node, also called gaps. This optimises the tree for allocating a * range. */ struct maple_arange_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; void __rcu *slot[MAPLE_ARANGE64_SLOTS]; unsigned long gap[MAPLE_ARANGE64_SLOTS]; struct maple_metadata meta; }; struct maple_alloc { unsigned long total; unsigned char node_count; unsigned int request_count; struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; }; struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ }; enum maple_type { maple_dense, maple_leaf_64, maple_range_64, maple_arange_64, }; enum store_type { wr_invalid, wr_new_root, wr_store_root, wr_exact_fit, wr_spanning_store, wr_split_store, wr_rebalance, wr_append, wr_node_store, wr_slot_store, }; /** * DOC: Maple tree flags * * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree * * MT_FLAGS_USE_RCU - Operate in RCU mode * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value * * MT_FLAGS_LOCK_MASK - How the mt_lock is used * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe * * MT_FLAGS_LOCK_BH - Acquired bh-safe * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used * * MAPLE_HEIGHT_MAX The largest height that can be stored */ #define MT_FLAGS_ALLOC_RANGE 0x01 #define MT_FLAGS_USE_RCU 0x02 #define MT_FLAGS_HEIGHT_OFFSET 0x02 #define MT_FLAGS_HEIGHT_MASK 0x7C #define MT_FLAGS_LOCK_MASK 0x300 #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 #define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 #define MAPLE_NODE_TYPE_MASK 0x0F #define MAPLE_NODE_TYPE_SHIFT 0x03 #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) #define mt_write_lock_is_held(mt) \ (!(mt)->ma_external_lock || \ lock_is_held_type((mt)->ma_external_lock, 0)) #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif /* * If the tree contains a single entry at index 0, it is usually stored in * tree->ma_root. To optimise for the page cache, an entry which ends in '00', * '01' or '11' is stored in the root, but an entry which ends in '10' will be * stored in a node. Bits 3-6 are used to store enum maple_type. * * The flags are used both to store some immutable information about this tree * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags are to indicate global states of the tree. This is the * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. */ struct maple_tree { union { spinlock_t ma_lock; #ifdef CONFIG_LOCKDEP struct lockdep_map *ma_external_lock; #endif }; unsigned int ma_flags; void __rcu *ma_root; }; /** * MTREE_INIT() - Initialize a maple tree * @name: The maple tree name * @__flags: The maple tree flags * */ #define MTREE_INIT(name, __flags) { \ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ .ma_flags = __flags, \ .ma_root = NULL, \ } /** * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. * @name: The tree name * @__flags: The maple tree flags * @__lock: The external lock */ #ifdef CONFIG_LOCKDEP #define MTREE_INIT_EXT(name, __flags, __lock) { \ .ma_external_lock = &(__lock).dep_map, \ .ma_flags = (__flags), \ .ma_root = NULL, \ } #else #define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) #endif #define DEFINE_MTREE(name) \ struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) #define mtree_lock_nested(mas, subclass) \ spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* * The Maple Tree squeezes various bits in at various points which aren't * necessarily obvious. Usually, this is done by observing that pointers are * N-byte aligned and thus the bottom log_2(N) bits are available for use. We * don't use the high bits of pointers to store additional information because * we don't know what bits are unused on any given architecture. * * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 * low bits for our own purposes. Nodes are currently of 4 types: * 1. Single pointer (Range is 0-0) * 2. Non-leaf Allocation Range nodes * 3. Non-leaf Range nodes * 4. Leaf Range nodes All nodes consist of a number of node slots, * pivots, and a parent pointer. */ struct maple_node { union { struct { struct maple_pnode *parent; void __rcu *slot[MAPLE_NODE_SLOTS]; }; struct { void *pad; struct rcu_head rcu; struct maple_enode *piv_parent; unsigned char parent_slot; enum maple_type type; unsigned char slot_len; unsigned int ma_flags; }; struct maple_range_64 mr64; struct maple_arange_64 ma64; struct maple_alloc alloc; }; }; /* * More complicated stores can cause two nodes to become one or three and * potentially alter the height of the tree. Either half of the tree may need * to be rebalanced against the other. The ma_topiary struct is used to track * which nodes have been 'cut' from the tree so that the change can be done * safely at a later date. This is done to support RCU. */ struct ma_topiary { struct maple_enode *head; struct maple_enode *tail; struct maple_tree *mtree; }; void *mtree_load(struct maple_tree *mt, unsigned long index); int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_store_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); /** * mtree_empty() - Determine if a tree has any present entries. * @mt: Maple Tree. * * Context: Any context. * Return: %true if the tree contains only NULL pointers. */ static inline bool mtree_empty(const struct maple_tree *mt) { return mt->ma_root == NULL; } /* Advanced API */ /* * Maple State Status * ma_active means the maple state is pointing to a node and offset and can * continue operating on the tree. * ma_start means we have not searched the tree. * ma_root means we have searched the tree and the entry we found lives in * the root of the tree (ie it has index 0, length 1 and is the only entry in * the tree). * ma_none means we have searched the tree and there is no node in the * tree for this entry. For example, we searched for index 1 in an empty * tree. Or we have a tree which points to a full leaf node and we * searched for an entry which is larger than can be contained in that * leaf node. * ma_pause means the data within the maple state may be stale, restart the * operation * ma_overflow means the search has reached the upper limit of the search * ma_underflow means the search has reached the lower limit of the search * ma_error means there was an error, check the node for the error number. */ enum maple_status { ma_active, ma_start, ma_root, ma_none, ma_pause, ma_overflow, ma_underflow, ma_error, }; /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the * advanced API. * * If state->node has bit 0 set then it references a tree location which is not * a node (eg the root). If bit 1 is set, the rest of the bits are a negative * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the * node type. * * state->alloc either has a request number of nodes or an allocated node. If * stat->alloc has a requested number of nodes, the first bit will be set (0x1) * and the remaining bits are the value. If state->alloc is a node, then the * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for * storing more allocated nodes, a total number of nodes allocated, and the * node_count in this node. node_count is the number of allocated nodes in this * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc * by removing a node from the state->alloc node until state->alloc->node_count * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted * to state->alloc. Nodes are pushed onto state->alloc by putting the current * state->alloc into the pushed node's slot[0]. * * The state also contains the implied min/max of the state->node, the depth of * this search, and the offset. The implied min/max are either from the parent * node or are 0-oo for the root node. The depth is incremented or decremented * every time a node is walked down or up. The offset is the slot/pivot of * interest in the node - either for reading or writing. * * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. * * The status of the state is used to determine how the next action should treat * the state. For instance, if the status is ma_start then the next action * should start at the root of the tree and walk down. If the status is * ma_pause then the node may be stale data and should be discarded. If the * status is ma_overflow, then the last action hit the upper limit. * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ unsigned long index; /* The index we're operating on - range start */ unsigned long last; /* The last index we're operating on - range end */ struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct slab_sheaf *sheaf; /* Allocated nodes for this operation */ struct maple_node *alloc; /* A single allocated node for fast path writes */ unsigned long node_request; /* The number of nodes to allocate for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { struct ma_state *mas; struct maple_node *node; /* Decoded mas->node */ unsigned long r_min; /* range min */ unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ unsigned char vacant_height; /* Height of lowest node with free space */ unsigned char sufficient_height;/* Height of lowest node with min sufficiency + 1 nodes */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) #define mas_lock_nested(mas, subclass) \ spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) /* * Special values for ma_state.node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) /* * When changing MA_STATE, remember to also change rust/kernel/maple_tree.rs */ #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ .index = first, \ .last = end, \ .node = NULL, \ .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .sheaf = NULL, \ .alloc = NULL, \ .node_request = 0, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ struct ma_wr_state name = { \ .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ .vacant_height = 0, \ .sufficient_height = 0 \ } #define MA_TOPIARY(name, tree) \ struct ma_topiary name = { \ .head = NULL, \ .tail = NULL, \ .mtree = tree, \ } void *mas_walk(struct ma_state *mas); void *mas_store(struct ma_state *mas, void *entry); void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, unsigned long addr) { memset(mas, 0, sizeof(struct ma_state)); mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; mas->status = ma_start; mas->node = NULL; } static inline bool mas_is_active(struct ma_state *mas) { return mas->status == ma_active; } static inline bool mas_is_err(struct ma_state *mas) { return mas->status == ma_error; } /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. * * Resets the error or walk state of the @mas so future walks of the * array will start from the root. Use this if you have dropped the * lock and want to reuse the ma_state. * * Context: Any context. */ static __always_inline void mas_reset(struct ma_state *mas) { mas->status = ma_start; mas->node = NULL; } /** * mas_for_each() - Iterate over a range of the maple tree. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__max: maximum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) /** * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__min: minimum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each_rev(__mas, __entry, __min) \ while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, mt_dump_hex, }; extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); void mas_dump(const struct ma_state *mas); void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_BUG_ON(__mas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_WR_BUG_ON(__wrmas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MT_WARN_ON(__tree, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WARN_ON(__mas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WR_WARN_ON(__wrmas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #else #define MT_BUG_ON(__tree, __x) BUG_ON(__x) #define MAS_BUG_ON(__mas, __x) BUG_ON(__x) #define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) #define MT_WARN_ON(__tree, __x) WARN_ON(__x) #define MAS_WARN_ON(__mas, __x) WARN_ON(__x) #define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * set the internal maple state values to a sub-range. * Please use mas_set_range() if you do not know where you are in the tree. */ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { /* Ensure the range starts within the current slot */ MAS_WARN_ON(mas, mas_is_active(mas) && (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } /** * mas_set_range() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * Move the operation state to refer to a different range. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { mas_reset(mas); __mas_set_range(mas, start, last); } /** * mas_set() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @index: New index into the Maple Tree. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set(struct ma_state *mas, unsigned long index) { mas_set_range(mas, index, index); } static inline bool mt_external_lock(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; } /** * mt_init_flags() - Initialise an empty maple tree with flags. * @mt: Maple Tree * @flags: maple tree flags. * * If you need to initialise a Maple Tree with special flags (eg, an * allocation tree), use this function. * * Context: Any context. */ static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) { mt->ma_flags = flags; if (!mt_external_lock(mt)) spin_lock_init(&mt->ma_lock); rcu_assign_pointer(mt->ma_root, NULL); } /** * mt_init() - Initialise an empty maple tree. * @mt: Maple Tree * * An empty Maple Tree. * * Context: Any context. */ static inline void mt_init(struct maple_tree *mt) { mt_init_flags(mt, 0); } static inline bool mt_in_rcu(struct maple_tree *mt) { #ifdef CONFIG_MAPLE_RCU_DISABLED return false; #endif return mt->ma_flags & MT_FLAGS_USE_RCU; } /** * mt_clear_in_rcu() - Switch the tree to non-RCU mode. * @mt: The Maple Tree */ static inline void mt_clear_in_rcu(struct maple_tree *mt) { if (!mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags &= ~MT_FLAGS_USE_RCU; mtree_unlock(mt); } } /** * mt_set_in_rcu() - Switch the tree to RCU safe mode. * @mt: The Maple Tree */ static inline void mt_set_in_rcu(struct maple_tree *mt) { if (mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags |= MT_FLAGS_USE_RCU; mtree_unlock(mt); } } static inline unsigned int mt_height(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); /** * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * * This iterator skips all entries, which resolve to a NULL pointer, * e.g. entries which has been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) #endif /*_LINUX_MAPLE_TREE_H */
3655 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 /* SPDX-License-Identifier: GPL-2.0 */ /* * Authors: Thiébaud Weksteen <tweek@google.com> * Peter Enderborg <Peter.Enderborg@sony.com> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM avc #if !defined(_TRACE_SELINUX_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SELINUX_H #include <linux/tracepoint.h> TRACE_EVENT(selinux_audited, TP_PROTO(struct selinux_audit_data *sad, char *scontext, char *tcontext, const char *tclass ), TP_ARGS(sad, scontext, tcontext, tclass), TP_STRUCT__entry( __field(u32, requested) __field(u32, denied) __field(u32, audited) __field(int, result) __string(scontext, scontext) __string(tcontext, tcontext) __string(tclass, tclass) ), TP_fast_assign( __entry->requested = sad->requested; __entry->denied = sad->denied; __entry->audited = sad->audited; __entry->result = sad->result; __assign_str(tcontext); __assign_str(scontext); __assign_str(tclass); ), TP_printk("requested=0x%x denied=0x%x audited=0x%x result=%d scontext=%s tcontext=%s tclass=%s", __entry->requested, __entry->denied, __entry->audited, __entry->result, __get_str(scontext), __get_str(tcontext), __get_str(tclass) ) ); #endif /* This part must be outside protection */ #include <trace/define_trace.h>
4 4 1 4 4 2 2 22 22 22 22 22 19 20 20 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2015 Intel Deutschland GmbH * Copyright (C) 2021-2023 Intel Corporation */ #include <linux/kobject.h> #include <linux/slab.h> #include "ieee80211_i.h" #include "key.h" #include "debugfs.h" #include "debugfs_key.h" #define KEY_READ(name, prop, format_string) \ static ssize_t key_##name##_read(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct ieee80211_key *key = file->private_data; \ return mac80211_format_buffer(userbuf, count, ppos, \ format_string, key->prop); \ } #define KEY_READ_X(name) KEY_READ(name, name, "0x%x\n") #define KEY_OPS(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_##name##_read, \ .llseek = generic_file_llseek, \ } #define KEY_OPS_W(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_##name##_read, \ .write = key_##name##_write, \ .llseek = generic_file_llseek, \ } #define KEY_FILE(name, format) \ KEY_READ_##format(name) \ KEY_OPS(name) #define KEY_CONF_READ(name, format_string) \ KEY_READ(conf_##name, conf.name, format_string) #define KEY_CONF_READ_D(name) KEY_CONF_READ(name, "%d\n") #define KEY_CONF_OPS(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_conf_##name##_read, \ .llseek = generic_file_llseek, \ } #define KEY_CONF_FILE(name, format) \ KEY_CONF_READ_##format(name) \ KEY_CONF_OPS(name) KEY_CONF_FILE(keylen, D); KEY_CONF_FILE(keyidx, D); KEY_CONF_FILE(hw_key_idx, D); KEY_FILE(flags, X); KEY_READ(ifindex, sdata->name, "%s\n"); KEY_OPS(ifindex); static ssize_t key_algorithm_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char buf[15]; struct ieee80211_key *key = file->private_data; u32 c = key->conf.cipher; sprintf(buf, "%.2x-%.2x-%.2x:%d\n", c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff); return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); } KEY_OPS(algorithm); static ssize_t key_tx_spec_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; u64 pn; int ret; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: /* not supported yet */ return -EOPNOTSUPP; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: ret = kstrtou64_from_user(userbuf, count, 16, &pn); if (ret) return ret; /* PN is a 48-bit counter */ if (pn >= (1ULL << 48)) return -ERANGE; atomic64_set(&key->conf.tx_pn, pn); return count; default: return 0; } } static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { u64 pn; char buf[20]; int len; struct ieee80211_key *key = file->private_data; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; case WLAN_CIPHER_SUITE_TKIP: pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%08x %04x\n", TKIP_PN_TO_IV32(pn), TKIP_PN_TO_IV16(pn)); break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS_W(tx_spec); static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[14*IEEE80211_NUM_TIDS+1], *p = buf; int i, len; const u8 *rpn; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; case WLAN_CIPHER_SUITE_TKIP: for (i = 0; i < IEEE80211_NUM_TIDS; i++) p += scnprintf(p, sizeof(buf)+buf-p, "%08x %04x\n", key->u.tkip.rx[i].iv32, key->u.tkip.rx[i].iv16); len = p - buf; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { rpn = key->u.ccmp.rx_pn[i]; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); } len = p - buf; break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: rpn = key->u.aes_cmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); len = p - buf; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: rpn = key->u.aes_gmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); len = p - buf; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { rpn = key->u.gcmp.rx_pn[i]; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); } len = p - buf; break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(rx_spec); static ssize_t key_replays_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.replays); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_gmac.replays); break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.gcmp.replays); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(replays); static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.icverrors); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_gmac.icverrors); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(icverrors); static ssize_t key_mic_failures_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; if (key->conf.cipher != WLAN_CIPHER_SUITE_TKIP) return -EINVAL; len = scnprintf(buf, sizeof(buf), "%u\n", key->u.tkip.mic_failures); return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(mic_failures); static ssize_t key_key_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; int i, bufsize = 2 * key->conf.keylen + 2; char *buf = kmalloc(bufsize, GFP_KERNEL); char *p = buf; ssize_t res; if (!buf) return -ENOMEM; for (i = 0; i < key->conf.keylen; i++) p += scnprintf(p, bufsize + buf - p, "%02x", key->conf.key[i]); p += scnprintf(p, bufsize+buf-p, "\n"); res = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return res; } KEY_OPS(key); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, key->debugfs.dir, \ key, &key_##name##_ops) #define DEBUGFS_ADD_W(name) \ debugfs_create_file(#name, 0600, key->debugfs.dir, \ key, &key_##name##_ops); void ieee80211_debugfs_key_add(struct ieee80211_key *key) { static int keycount; char buf[100]; struct sta_info *sta; if (!key->local->debugfs.keys) return; sprintf(buf, "%d", keycount); key->debugfs.cnt = keycount; keycount++; key->debugfs.dir = debugfs_create_dir(buf, key->local->debugfs.keys); sta = key->sta; if (sta) { sprintf(buf, "../../netdev:%s/stations/%pM", sta->sdata->name, sta->sta.addr); key->debugfs.stalink = debugfs_create_symlink("station", key->debugfs.dir, buf); } DEBUGFS_ADD(keylen); DEBUGFS_ADD(flags); DEBUGFS_ADD(keyidx); DEBUGFS_ADD(hw_key_idx); DEBUGFS_ADD(algorithm); DEBUGFS_ADD_W(tx_spec); DEBUGFS_ADD(rx_spec); DEBUGFS_ADD(replays); DEBUGFS_ADD(icverrors); DEBUGFS_ADD(mic_failures); DEBUGFS_ADD(key); DEBUGFS_ADD(ifindex); }; void ieee80211_debugfs_key_remove(struct ieee80211_key *key) { if (!key) return; debugfs_remove_recursive(key->debugfs.dir); key->debugfs.dir = NULL; } void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata) { char buf[50]; struct ieee80211_key *key; if (!sdata->vif.debugfs_dir) return; lockdep_assert_wiphy(sdata->local->hw.wiphy); debugfs_remove(sdata->debugfs.default_unicast_key); sdata->debugfs.default_unicast_key = NULL; if (sdata->default_unicast_key) { key = wiphy_dereference(sdata->local->hw.wiphy, sdata->default_unicast_key); sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_unicast_key = debugfs_create_symlink("default_unicast_key", sdata->vif.debugfs_dir, buf); } debugfs_remove(sdata->debugfs.default_multicast_key); sdata->debugfs.default_multicast_key = NULL; if (sdata->deflink.default_multicast_key) { key = wiphy_dereference(sdata->local->hw.wiphy, sdata->deflink.default_multicast_key); sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_multicast_key = debugfs_create_symlink("default_multicast_key", sdata->vif.debugfs_dir, buf); } } void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata) { if (!sdata) return; debugfs_remove(sdata->debugfs.default_mgmt_key); sdata->debugfs.default_mgmt_key = NULL; } void ieee80211_debugfs_key_remove_beacon_default(struct ieee80211_sub_if_data *sdata) { if (!sdata) return; debugfs_remove(sdata->debugfs.default_beacon_key); sdata->debugfs.default_beacon_key = NULL; }
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 2 2 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 // SPDX-License-Identifier: GPL-2.0-only /* * vsock sock_diag(7) module * * Copyright (C) 2017 Red Hat, Inc. * Author: Stefan Hajnoczi <stefanha@redhat.com> */ #include <linux/module.h> #include <linux/sock_diag.h> #include <linux/vm_sockets_diag.h> #include <net/af_vsock.h> static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u32 flags) { struct vsock_sock *vsk = vsock_sk(sk); struct vsock_diag_msg *rep; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), flags); if (!nlh) return -EMSGSIZE; rep = nlmsg_data(nlh); rep->vdiag_family = AF_VSOCK; /* Lock order dictates that sk_lock is acquired before * vsock_table_lock, so we cannot lock here. Simply don't take * sk_lock; sk is guaranteed to stay alive since vsock_table_lock is * held. */ rep->vdiag_type = sk->sk_type; rep->vdiag_state = sk->sk_state; rep->vdiag_shutdown = sk->sk_shutdown; rep->vdiag_src_cid = vsk->local_addr.svm_cid; rep->vdiag_src_port = vsk->local_addr.svm_port; rep->vdiag_dst_cid = vsk->remote_addr.svm_cid; rep->vdiag_dst_port = vsk->remote_addr.svm_port; rep->vdiag_ino = sock_i_ino(sk); sock_diag_save_cookie(sk, rep->vdiag_cookie); return 0; } static int vsock_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct vsock_diag_req *req; struct vsock_sock *vsk; unsigned int bucket; unsigned int last_i; unsigned int table; struct net *net; unsigned int i; req = nlmsg_data(cb->nlh); net = sock_net(skb->sk); /* State saved between calls: */ table = cb->args[0]; bucket = cb->args[1]; i = last_i = cb->args[2]; /* TODO VMCI pending sockets? */ spin_lock_bh(&vsock_table_lock); /* Bind table (locally created sockets) */ if (table == 0) { while (bucket < ARRAY_SIZE(vsock_bind_table)) { struct list_head *head = &vsock_bind_table[bucket]; i = 0; list_for_each_entry(vsk, head, bound_table) { struct sock *sk = sk_vsock(vsk); if (!net_eq(sock_net(sk), net)) continue; if (i < last_i) goto next_bind; if (!(req->vdiag_states & (1 << sk->sk_state))) goto next_bind; if (sk_diag_fill(sk, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) goto done; next_bind: i++; } last_i = 0; bucket++; } table++; bucket = 0; } /* Connected table (accepted connections) */ while (bucket < ARRAY_SIZE(vsock_connected_table)) { struct list_head *head = &vsock_connected_table[bucket]; i = 0; list_for_each_entry(vsk, head, connected_table) { struct sock *sk = sk_vsock(vsk); /* Skip sockets we've already seen above */ if (__vsock_in_bound_table(vsk)) continue; if (!net_eq(sock_net(sk), net)) continue; if (i < last_i) goto next_connected; if (!(req->vdiag_states & (1 << sk->sk_state))) goto next_connected; if (sk_diag_fill(sk, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) goto done; next_connected: i++; } last_i = 0; bucket++; } done: spin_unlock_bh(&vsock_table_lock); cb->args[0] = table; cb->args[1] = bucket; cb->args[2] = i; return skb->len; } static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct vsock_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = vsock_diag_dump, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return -EOPNOTSUPP; } static const struct sock_diag_handler vsock_diag_handler = { .owner = THIS_MODULE, .family = AF_VSOCK, .dump = vsock_diag_handler_dump, }; static int __init vsock_diag_init(void) { return sock_diag_register(&vsock_diag_handler); } static void __exit vsock_diag_exit(void) { sock_diag_unregister(&vsock_diag_handler); } module_init(vsock_diag_init); module_exit(vsock_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("VMware Virtual Sockets monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 40 /* AF_VSOCK */);
2825 1593 1 1466 356 2 879 518 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 /* SPDX-License-Identifier: GPL-2.0 */ /* * This file provides wrappers with sanitizer instrumentation for atomic bit * operations. * * To use this functionality, an arch's bitops.h file needs to define each of * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), * arch___set_bit(), etc.). */ #ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H #define _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H #include <linux/instrumented.h> /** * set_bit - Atomically set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). * * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ static __always_inline void set_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_set_bit(nr, addr); } /** * clear_bit - Clears a bit in memory * @nr: Bit to clear * @addr: Address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). */ static __always_inline void clear_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_clear_bit(nr, addr); } /** * change_bit - Toggle a bit in memory * @nr: Bit to change * @addr: Address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). * * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ static __always_inline void change_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_change_bit(nr, addr); } /** * test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_set_bit(nr, addr); } /** * test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_clear_bit(nr, addr); } /** * test_and_change_bit - Change a bit and return its old value * @nr: Bit to change * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_change_bit(nr, addr); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
2 2 2 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/seq_file.h> #include <net/ip.h> #include <net/mptcp.h> #include <net/snmp.h> #include <net/net_namespace.h> #include "mib.h" static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE), SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK), SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), SNMP_MIB_ITEM("MPCapableSYNTXDrop", MPTCP_MIB_MPCAPABLEACTIVEDROP), SNMP_MIB_ITEM("MPCapableSYNTXDisabled", MPTCP_MIB_MPCAPABLEACTIVEDISABLED), SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT), SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT), SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), SNMP_MIB_ITEM("MPJoinSynBackupRx", MPTCP_MIB_JOINSYNBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), SNMP_MIB_ITEM("MPJoinSynAckBackupRx", MPTCP_MIB_JOINSYNACKBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("MPJoinRejected", MPTCP_MIB_JOINREJECTED), SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX), SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR), SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR), SNMP_MIB_ITEM("MPJoinSynTxConnectErr", MPTCP_MIB_JOINSYNTXCONNECTERR), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("DSSCorruptionFallback", MPTCP_MIB_DSSCORRUPTIONFALLBACK), SNMP_MIB_ITEM("DSSCorruptionReset", MPTCP_MIB_DSSCORRUPTIONRESET), SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW), SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA), SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX), SNMP_MIB_ITEM("AddAddrTxDrop", MPTCP_MIB_ADDADDRTXDROP), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), SNMP_MIB_ITEM("EchoAddTx", MPTCP_MIB_ECHOADDTX), SNMP_MIB_ITEM("EchoAddTxDrop", MPTCP_MIB_ECHOADDTXDROP), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), SNMP_MIB_ITEM("RmAddrTx", MPTCP_MIB_RMADDRTX), SNMP_MIB_ITEM("RmAddrTxDrop", MPTCP_MIB_RMADDRTXDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX), SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX), SNMP_MIB_ITEM("MPFastcloseTx", MPTCP_MIB_MPFASTCLOSETX), SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED), SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE), SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT), SNMP_MIB_ITEM("MPCurrEstab", MPTCP_MIB_CURRESTAB), SNMP_MIB_ITEM("Blackhole", MPTCP_MIB_BLACKHOLE), SNMP_MIB_ITEM("MPCapableDataFallback", MPTCP_MIB_MPCAPABLEDATAFALLBACK), SNMP_MIB_ITEM("MD5SigFallback", MPTCP_MIB_MD5SIGFALLBACK), SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK), SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), SNMP_MIB_ITEM("WinProbe", MPTCP_MIB_WINPROBE), }; /* mptcp_mib_alloc - allocate percpu mib counters * * These are allocated when the first mptcp socket is created so * we do not waste percpu memory if mptcp isn't in use. */ bool mptcp_mib_alloc(struct net *net) { struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib); if (!mib) return false; if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib)) free_percpu(mib); return true; } void mptcp_seq_show(struct seq_file *seq) { unsigned long sum[ARRAY_SIZE(mptcp_snmp_list)]; const int cnt = ARRAY_SIZE(mptcp_snmp_list); struct net *net = seq->private; int i; seq_puts(seq, "MPTcpExt:"); for (i = 0; i < cnt; i++) seq_printf(seq, " %s", mptcp_snmp_list[i].name); seq_puts(seq, "\nMPTcpExt:"); memset(sum, 0, sizeof(sum)); if (net->mib.mptcp_statistics) snmp_get_cpu_field_batch_cnt(sum, mptcp_snmp_list, cnt, net->mib.mptcp_statistics); for (i = 0; i < cnt; i++) seq_printf(seq, " %lu", sum[i]); seq_putc(seq, '\n'); }
1 4 4 4 4 4 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * This is based in part on Andrew Moon's poly1305-donna, which is in the * public domain. */ #include <crypto/internal/poly1305.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/unaligned.h> void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) { u64 t0, t1; /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ t0 = get_unaligned_le64(&raw_key[0]); t1 = get_unaligned_le64(&raw_key[8]); key->key.r64[0] = t0 & 0xffc0fffffffULL; key->key.r64[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffffULL; key->key.r64[2] = ((t1 >> 24)) & 0x00ffffffc0fULL; /* s = 20*r */ key->precomputed_s.r64[0] = key->key.r64[1] * 20; key->precomputed_s.r64[1] = key->key.r64[2] * 20; } EXPORT_SYMBOL(poly1305_core_setkey); void poly1305_core_blocks(struct poly1305_state *state, const struct poly1305_core_key *key, const void *src, unsigned int nblocks, u32 hibit) { const u8 *input = src; u64 hibit64; u64 r0, r1, r2; u64 s1, s2; u64 h0, h1, h2; u64 c; u128 d0, d1, d2, d; if (!nblocks) return; hibit64 = ((u64)hibit) << 40; r0 = key->key.r64[0]; r1 = key->key.r64[1]; r2 = key->key.r64[2]; h0 = state->h64[0]; h1 = state->h64[1]; h2 = state->h64[2]; s1 = key->precomputed_s.r64[0]; s2 = key->precomputed_s.r64[1]; do { u64 t0, t1; /* h += m[i] */ t0 = get_unaligned_le64(&input[0]); t1 = get_unaligned_le64(&input[8]); h0 += t0 & 0xfffffffffffULL; h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL; h2 += (((t1 >> 24)) & 0x3ffffffffffULL) | hibit64; /* h *= r */ d0 = (u128)h0 * r0; d = (u128)h1 * s2; d0 += d; d = (u128)h2 * s1; d0 += d; d1 = (u128)h0 * r1; d = (u128)h1 * r0; d1 += d; d = (u128)h2 * s2; d1 += d; d2 = (u128)h0 * r2; d = (u128)h1 * r1; d2 += d; d = (u128)h2 * r0; d2 += d; /* (partial) h %= p */ c = (u64)(d0 >> 44); h0 = (u64)d0 & 0xfffffffffffULL; d1 += c; c = (u64)(d1 >> 44); h1 = (u64)d1 & 0xfffffffffffULL; d2 += c; c = (u64)(d2 >> 42); h2 = (u64)d2 & 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 = h0 & 0xfffffffffffULL; h1 += c; input += POLY1305_BLOCK_SIZE; } while (--nblocks); state->h64[0] = h0; state->h64[1] = h1; state->h64[2] = h2; } EXPORT_SYMBOL(poly1305_core_blocks); void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4], void *dst) { u8 *mac = dst; u64 h0, h1, h2, c; u64 g0, g1, g2; u64 t0, t1; /* fully carry h */ h0 = state->h64[0]; h1 = state->h64[1]; h2 = state->h64[2]; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += c; c = h2 >> 42; h2 &= 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += c; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += c; c = h2 >> 42; h2 &= 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += c; /* compute h + -p */ g0 = h0 + 5; c = g0 >> 44; g0 &= 0xfffffffffffULL; g1 = h1 + c; c = g1 >> 44; g1 &= 0xfffffffffffULL; g2 = h2 + c - (1ULL << 42); /* select h if h < p, or h + -p if h >= p */ c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1; g0 &= c; g1 &= c; g2 &= c; c = ~c; h0 = (h0 & c) | g0; h1 = (h1 & c) | g1; h2 = (h2 & c) | g2; if (likely(nonce)) { /* h = (h + nonce) */ t0 = ((u64)nonce[1] << 32) | nonce[0]; t1 = ((u64)nonce[3] << 32) | nonce[2]; h0 += t0 & 0xfffffffffffULL; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL) + c; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += (((t1 >> 24)) & 0x3ffffffffffULL) + c; h2 &= 0x3ffffffffffULL; } /* mac = h % (2^128) */ h0 = h0 | (h1 << 44); h1 = (h1 >> 20) | (h2 << 24); put_unaligned_le64(h0, &mac[0]); put_unaligned_le64(h1, &mac[8]); } EXPORT_SYMBOL(poly1305_core_emit);
6482 6503 6463 6502 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 /* SPDX-License-Identifier: GPL-2.0 */ /* * generic net pointers */ #ifndef __NET_GENERIC_H__ #define __NET_GENERIC_H__ #include <linux/bug.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> /* * Generic net pointers are to be used by modules to put some private * stuff on the struct net without explicit struct net modification * * The rules are simple: * 1. set pernet_operations->id. After register_pernet_device you * will have the id of your private pointer. * 2. set pernet_operations->size to have the code allocate and free * a private structure pointed to from struct net. * 3. do not change this pointer while the net is alive; * 4. do not try to have any private reference on the net_generic object. * * After accomplishing all of the above, the private pointer can be * accessed with the net_generic() call. */ struct net_generic { union { struct { unsigned int len; struct rcu_head rcu; } s; DECLARE_FLEX_ARRAY(void *, ptr); }; }; static inline void *net_generic(const struct net *net, unsigned int id) { struct net_generic *ng; void *ptr; rcu_read_lock(); ng = rcu_dereference(net->gen); ptr = ng->ptr[id]; rcu_read_unlock(); return ptr; } #endif
235 404 40 46 46 414 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 // SPDX-License-Identifier: GPL-2.0 #ifndef __KVM_X86_MMU_TDP_MMU_H #define __KVM_X86_MMU_TDP_MMU_H #include <linux/kvm_host.h> #include "spte.h" void kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) { return refcount_inc_not_zero(&root->tdp_mmu_root_count); } void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); enum kvm_tdp_mmu_root_types { KVM_INVALID_ROOTS = BIT(0), KVM_DIRECT_ROOTS = BIT(1), KVM_MIRROR_ROOTS = BIT(2), KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS, KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS, }; static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm, enum kvm_gfn_range_filter process) { enum kvm_tdp_mmu_root_types ret = 0; if (!kvm_has_mirrored_tdp(kvm)) return KVM_DIRECT_ROOTS; if (process & KVM_FILTER_PRIVATE) ret |= KVM_MIRROR_ROOTS; if (process & KVM_FILTER_SHARED) ret |= KVM_DIRECT_ROOTS; WARN_ON_ONCE(!ret); return ret; } static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr))) return root_to_sp(vcpu->arch.mmu->mirror_root_hpa); return root_to_sp(vcpu->arch.mmu->root.hpa); } static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu, enum kvm_tdp_mmu_root_types type) { if (unlikely(type == KVM_MIRROR_ROOTS)) return root_to_sp(vcpu->arch.mmu->mirror_root_hpa); return root_to_sp(vcpu->arch.mmu->root.hpa); } bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm, enum kvm_tdp_mmu_root_types root_types); void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush); bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t start, gfn_t end, int target_level, bool shared); static inline void kvm_tdp_mmu_walk_lockless_begin(void) { rcu_read_lock(); } static inline void kvm_tdp_mmu_walk_lockless_end(void) { rcu_read_unlock(); } int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte); #ifdef CONFIG_X86_64 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } #else static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } #endif #endif /* __KVM_X86_MMU_TDP_MMU_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 /* SPDX-License-Identifier: GPL-2.0 * * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. * Do not use in new code. */ #ifndef _BLK_CGROUP_RWSTAT_H #define _BLK_CGROUP_RWSTAT_H #include "blk-cgroup.h" enum blkg_rwstat_type { BLKG_RWSTAT_READ, BLKG_RWSTAT_WRITE, BLKG_RWSTAT_SYNC, BLKG_RWSTAT_ASYNC, BLKG_RWSTAT_DISCARD, BLKG_RWSTAT_NR, BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, }; /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for * recursive. Used to carry stats of dead children. */ struct blkg_rwstat { struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; struct blkg_rwstat_sample { u64 cnt[BLKG_RWSTAT_NR]; }; static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, unsigned int idx) { return atomic64_read(&rwstat->aux_cnt[idx]) + percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); } int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp); void blkg_rwstat_exit(struct blkg_rwstat *rwstat); u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, const struct blkg_rwstat_sample *rwstat); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off, struct blkg_rwstat_sample *sum); /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat * @opf: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @rw. The * caller is responsible for synchronizing calls to this function. */ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, blk_opf_t opf, uint64_t val) { struct percpu_counter *cnt; if (op_is_discard(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; else if (op_is_write(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); if (op_is_sync(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read * @result: where to put the current values * * Read the current snapshot of @rwstat and return it in the @result counts. */ static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, struct blkg_rwstat_sample *result) { int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) result->cnt[i] = percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); } /** * blkg_rwstat_total - read the total count of a blkg_rwstat * @rwstat: blkg_rwstat to read * * Return the total count of @rwstat regardless of the IO direction. This * function can be called without synchronization and takes care of u64 * atomicity. */ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat_sample tmp = { }; blkg_rwstat_read(rwstat, &tmp); return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; } /** * blkg_rwstat_reset - reset a blkg_rwstat * @rwstat: blkg_rwstat to reset */ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) { percpu_counter_set(&rwstat->cpu_cnt[i], 0); atomic64_set(&rwstat->aux_cnt[i], 0); } } /** * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count * @to: the destination blkg_rwstat * @from: the source * * Add @from's count including the aux one to @to's aux count. */ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, struct blkg_rwstat *from) { u64 sum[BLKG_RWSTAT_NR]; int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); for (i = 0; i < BLKG_RWSTAT_NR; i++) atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } #endif /* _BLK_CGROUP_RWSTAT_H */
5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __PSP_PSP_H #define __PSP_PSP_H #include <linux/list.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <net/netns/generic.h> #include <net/psp.h> #include <net/sock.h> extern struct xarray psp_devs; extern struct mutex psp_devs_lock; void psp_dev_free(struct psp_dev *psd); int psp_dev_check_access(struct psp_dev *psd, struct net *net); void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); struct psp_assoc *psp_assoc_create(struct psp_dev *psd); struct psp_dev *psp_dev_get_for_sock(struct sock *sk); void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, struct psp_key_parsed *key, struct netlink_ext_ack *extack); int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, u32 version, struct psp_key_parsed *key, struct netlink_ext_ack *extack); void psp_assocs_key_rotated(struct psp_dev *psd); static inline void psp_dev_get(struct psp_dev *psd) { refcount_inc(&psd->refcnt); } static inline bool psp_dev_tryget(struct psp_dev *psd) { return refcount_inc_not_zero(&psd->refcnt); } static inline void psp_dev_put(struct psp_dev *psd) { if (refcount_dec_and_test(&psd->refcnt)) psp_dev_free(psd); } static inline bool psp_dev_is_registered(struct psp_dev *psd) { lockdep_assert_held(&psd->lock); return !!psd->ops; } #endif /* __PSP_PSP_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 // SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/irqnr.h> #include <linux/sched/cputime.h> #include <linux/tick.h> #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 #endif #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; if (cpu_online(cpu)) idle_usecs = get_cpu_idle_time_us(cpu, NULL); if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ idle = kcs->cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; if (cpu_online(cpu)) iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ iowait = kcs->cpustat[CPUTIME_IOWAIT]; else iowait = iowait_usecs * NSEC_PER_USEC; return iowait; } static void show_irq_gap(struct seq_file *p, unsigned int gap) { static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; while (gap > 0) { unsigned int inc; inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2); seq_write(p, zeros, 2 * inc); gap -= inc; } } static void show_all_irqs(struct seq_file *p) { unsigned int i, next = 0; for_each_active_irq(i) { show_irq_gap(p, i - next); seq_put_decimal_ull(p, " ", kstat_irqs_usr(i)); next = i + 1; } show_irq_gap(p, irq_get_nr_irqs() - next); } static int show_stat(struct seq_file *p, void *v) { int i, j; u64 user, nice, system, idle, iowait, irq, softirq, steal; u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec64 boottime; user = nice = system = idle = iowait = irq = softirq = steal = 0; guest = guest_nice = 0; getboottime64(&boottime); /* shift boot timestamp according to the timens offset */ timens_sub_boottime(&boottime); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; nice += cpustat[CPUTIME_NICE]; system += cpustat[CPUTIME_SYSTEM]; idle += get_idle_time(&kcpustat, i); iowait += get_iowait_time(&kcpustat, i); irq += cpustat[CPUTIME_IRQ]; softirq += cpustat[CPUTIME_SOFTIRQ]; steal += cpustat[CPUTIME_STEAL]; guest += cpustat[CPUTIME_GUEST]; guest_nice += cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); per_softirq_sums[j] += softirq_stat; sum_softirq += softirq_stat; } } sum += arch_irq_stat(); seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); for_each_online_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; kcpustat_cpu_fetch(&kcpustat, i); /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = cpustat[CPUTIME_USER]; nice = cpustat[CPUTIME_NICE]; system = cpustat[CPUTIME_SYSTEM]; idle = get_idle_time(&kcpustat, i); iowait = get_iowait_time(&kcpustat, i); irq = cpustat[CPUTIME_IRQ]; softirq = cpustat[CPUTIME_SOFTIRQ]; steal = cpustat[CPUTIME_STEAL]; guest = cpustat[CPUTIME_GUEST]; guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); } seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); show_all_irqs(p); seq_printf(p, "\nctxt %llu\n" "btime %llu\n" "processes %lu\n" "procs_running %u\n" "procs_blocked %u\n", nr_context_switches(), (unsigned long long)boottime.tv_sec, total_forks, nr_running(), nr_iowait()); seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) seq_put_decimal_ull(p, " ", per_softirq_sums[i]); seq_putc(p, '\n'); return 0; } static int stat_open(struct inode *inode, struct file *file) { unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * irq_get_nr_irqs(); return single_open_size(file, show_stat, NULL, size); } static const struct proc_ops stat_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; static int __init proc_stat_init(void) { proc_create("stat", 0, NULL, &stat_proc_ops); return 0; } fs_initcall(proc_stat_init);
1 1 4 4 2 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer device management * Copyright (c) 1999 by Takashi Iwai <tiwai@suse.de> * *---------------------------------------------------------------- * * This device handler separates the card driver module from sequencer * stuff (sequencer core, synth drivers, etc), so that user can avoid * to spend unnecessary resources e.g. if he needs only listening to * MP3s. * * The card (or lowlevel) driver creates a sequencer device entry * via snd_seq_device_new(). This is an entry pointer to communicate * with the sequencer device "driver", which is involved with the * actual part to communicate with the sequencer core. * Each sequencer device entry has an id string and the corresponding * driver with the same id is loaded when required. For example, * lowlevel codes to access emu8000 chip on sbawe card are included in * emu8000-synth module. To activate this module, the hardware * resources like i/o port are passed via snd_seq_device argument. */ #include <linux/device.h> #include <linux/init.h> #include <linux/module.h> #include <sound/core.h> #include <sound/info.h> #include <sound/seq_device.h> #include <sound/seq_kernel.h> #include <sound/initval.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/mutex.h> MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>"); MODULE_DESCRIPTION("ALSA sequencer device management"); MODULE_LICENSE("GPL"); /* * bus definition */ static int snd_seq_bus_match(struct device *dev, const struct device_driver *drv) { struct snd_seq_device *sdev = to_seq_dev(dev); const struct snd_seq_driver *sdrv = to_seq_drv(drv); return strcmp(sdrv->id, sdev->id) == 0 && sdrv->argsize == sdev->argsize; } static const struct bus_type snd_seq_bus_type = { .name = "snd_seq", .match = snd_seq_bus_match, }; /* * proc interface -- just for compatibility */ #ifdef CONFIG_SND_PROC_FS static struct snd_info_entry *info_entry; static int print_dev_info(struct device *dev, void *data) { struct snd_seq_device *sdev = to_seq_dev(dev); struct snd_info_buffer *buffer = data; snd_iprintf(buffer, "snd-%s,%s,%d\n", sdev->id, dev->driver ? "loaded" : "empty", dev->driver ? 1 : 0); return 0; } static void snd_seq_device_info(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { bus_for_each_dev(&snd_seq_bus_type, NULL, buffer, print_dev_info); } #endif /* * load all registered drivers (called from seq_clientmgr.c) */ #ifdef CONFIG_MODULES /* flag to block auto-loading */ static atomic_t snd_seq_in_init = ATOMIC_INIT(1); /* blocked as default */ static int request_seq_drv(struct device *dev, void *data) { struct snd_seq_device *sdev = to_seq_dev(dev); if (!dev->driver) request_module("snd-%s", sdev->id); return 0; } static void autoload_drivers(struct work_struct *work) { /* avoid reentrance */ if (atomic_inc_return(&snd_seq_in_init) == 1) bus_for_each_dev(&snd_seq_bus_type, NULL, NULL, request_seq_drv); atomic_dec(&snd_seq_in_init); } static DECLARE_WORK(autoload_work, autoload_drivers); static void queue_autoload_drivers(void) { schedule_work(&autoload_work); } void snd_seq_autoload_init(void) { atomic_dec(&snd_seq_in_init); #ifdef CONFIG_SND_SEQUENCER_MODULE /* initial autoload only when snd-seq is a module */ queue_autoload_drivers(); #endif } EXPORT_SYMBOL(snd_seq_autoload_init); void snd_seq_autoload_exit(void) { atomic_inc(&snd_seq_in_init); } EXPORT_SYMBOL(snd_seq_autoload_exit); void snd_seq_device_load_drivers(void) { queue_autoload_drivers(); flush_work(&autoload_work); } EXPORT_SYMBOL(snd_seq_device_load_drivers); static inline void cancel_autoload_drivers(void) { cancel_work_sync(&autoload_work); } #else static inline void queue_autoload_drivers(void) { } static inline void cancel_autoload_drivers(void) { } #endif /* * device management */ static int snd_seq_device_dev_free(struct snd_device *device) { struct snd_seq_device *dev = device->device_data; cancel_autoload_drivers(); if (dev->private_free) dev->private_free(dev); put_device(&dev->dev); return 0; } static int snd_seq_device_dev_register(struct snd_device *device) { struct snd_seq_device *dev = device->device_data; int err; err = device_add(&dev->dev); if (err < 0) return err; if (!dev->dev.driver) queue_autoload_drivers(); return 0; } static int snd_seq_device_dev_disconnect(struct snd_device *device) { struct snd_seq_device *dev = device->device_data; device_del(&dev->dev); return 0; } static void snd_seq_dev_release(struct device *dev) { kfree(to_seq_dev(dev)); } /* * register a sequencer device * card = card info * device = device number (if any) * id = id of driver * result = return pointer (NULL allowed if unnecessary) */ int snd_seq_device_new(struct snd_card *card, int device, const char *id, int argsize, struct snd_seq_device **result) { struct snd_seq_device *dev; int err; static const struct snd_device_ops dops = { .dev_free = snd_seq_device_dev_free, .dev_register = snd_seq_device_dev_register, .dev_disconnect = snd_seq_device_dev_disconnect, }; if (result) *result = NULL; if (snd_BUG_ON(!id)) return -EINVAL; dev = kzalloc(sizeof(*dev) + argsize, GFP_KERNEL); if (!dev) return -ENOMEM; /* set up device info */ dev->card = card; dev->device = device; dev->id = id; dev->argsize = argsize; device_initialize(&dev->dev); dev->dev.parent = &card->card_dev; dev->dev.bus = &snd_seq_bus_type; dev->dev.release = snd_seq_dev_release; dev_set_name(&dev->dev, "%s-%d-%d", dev->id, card->number, device); /* add this device to the list */ err = snd_device_new(card, SNDRV_DEV_SEQUENCER, dev, &dops); if (err < 0) { put_device(&dev->dev); return err; } if (result) *result = dev; return 0; } EXPORT_SYMBOL(snd_seq_device_new); /* * driver registration */ int __snd_seq_driver_register(struct snd_seq_driver *drv, struct module *mod) { if (WARN_ON(!drv->driver.name || !drv->id)) return -EINVAL; drv->driver.bus = &snd_seq_bus_type; drv->driver.owner = mod; return driver_register(&drv->driver); } EXPORT_SYMBOL_GPL(__snd_seq_driver_register); void snd_seq_driver_unregister(struct snd_seq_driver *drv) { driver_unregister(&drv->driver); } EXPORT_SYMBOL_GPL(snd_seq_driver_unregister); /* * module part */ static int __init seq_dev_proc_init(void) { #ifdef CONFIG_SND_PROC_FS info_entry = snd_info_create_module_entry(THIS_MODULE, "drivers", snd_seq_root); if (info_entry == NULL) return -ENOMEM; info_entry->content = SNDRV_INFO_CONTENT_TEXT; info_entry->c.text.read = snd_seq_device_info; if (snd_info_register(info_entry) < 0) { snd_info_free_entry(info_entry); return -ENOMEM; } #endif return 0; } static int __init alsa_seq_device_init(void) { int err; err = bus_register(&snd_seq_bus_type); if (err < 0) return err; err = seq_dev_proc_init(); if (err < 0) bus_unregister(&snd_seq_bus_type); return err; } static void __exit alsa_seq_device_exit(void) { #ifdef CONFIG_MODULES cancel_work_sync(&autoload_work); #endif #ifdef CONFIG_SND_PROC_FS snd_info_free_entry(info_entry); #endif bus_unregister(&snd_seq_bus_type); } subsys_initcall(alsa_seq_device_init) module_exit(alsa_seq_device_exit)
2 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match ROUTING parameters. */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <asm/byteorder.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_rt.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 Routing Header match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); /* Returns 1 if the id is matched by the range, 0 otherwise */ static inline bool segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { return (id >= min && id <= max) ^ invert; } static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6_rt_hdr _route; const struct ipv6_rt_hdr *rh; const struct ip6t_rt *rtinfo = par->matchinfo; unsigned int temp; unsigned int ptr = 0; unsigned int hdrlen = 0; bool ret = false; struct in6_addr _addr; const struct in6_addr *ap; int err; err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL, NULL); if (err < 0) { if (err != -ENOENT) par->hotdrop = true; return false; } rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route); if (rh == NULL) { par->hotdrop = true; return false; } hdrlen = ipv6_optlen(rh); if (skb->len - ptr < hdrlen) { /* Pcket smaller than its length field */ return false; } ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], rh->segments_left, !!(rtinfo->invflags & IP6T_RT_INV_SGS))) && (!(rtinfo->flags & IP6T_RT_LEN) || ((rtinfo->hdrlen == hdrlen) ^ !!(rtinfo->invflags & IP6T_RT_INV_LEN))) && (!(rtinfo->flags & IP6T_RT_TYP) || ((rtinfo->rt_type == rh->type) ^ !!(rtinfo->invflags & IP6T_RT_INV_TYP))); if (ret && (rtinfo->flags & IP6T_RT_RES)) { const u_int32_t *rp; u_int32_t _reserved; rp = skb_header_pointer(skb, ptr + offsetof(struct rt0_hdr, reserved), sizeof(_reserved), &_reserved); if (!rp) { par->hotdrop = true; return false; } ret = (*rp == 0); } if (!(rtinfo->flags & IP6T_RT_FST)) { return ret; } else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) { if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { return false; } else { unsigned int i = 0; for (temp = 0; temp < (unsigned int)((hdrlen - 8) / 16); temp++) { ap = skb_header_pointer(skb, ptr + sizeof(struct rt0_hdr) + temp * sizeof(_addr), sizeof(_addr), &_addr); if (ap == NULL) { par->hotdrop = true; return false; } if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) i++; if (i == rtinfo->addrnr) break; } if (i == rtinfo->addrnr) return ret; else return false; } } else { if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { return false; } else { for (temp = 0; temp < rtinfo->addrnr; temp++) { ap = skb_header_pointer(skb, ptr + sizeof(struct rt0_hdr) + temp * sizeof(_addr), sizeof(_addr), &_addr); if (ap == NULL) { par->hotdrop = true; return false; } if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) break; } if (temp == rtinfo->addrnr && temp == (unsigned int)((hdrlen - 8) / 16)) return ret; else return false; } } return false; } static int rt_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_rt *rtinfo = par->matchinfo; if (rtinfo->invflags & ~IP6T_RT_INV_MASK) { pr_debug("unknown flags %X\n", rtinfo->invflags); return -EINVAL; } if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) && (!(rtinfo->flags & IP6T_RT_TYP) || (rtinfo->rt_type != 0) || (rtinfo->invflags & IP6T_RT_INV_TYP))) { pr_debug("`--rt-type 0' required before `--rt-0-*'"); return -EINVAL; } return 0; } static struct xt_match rt_mt6_reg __read_mostly = { .name = "rt", .family = NFPROTO_IPV6, .match = rt_mt6, .matchsize = sizeof(struct ip6t_rt), .checkentry = rt_mt6_check, .me = THIS_MODULE, }; static int __init rt_mt6_init(void) { return xt_register_match(&rt_mt6_reg); } static void __exit rt_mt6_exit(void) { xt_unregister_match(&rt_mt6_reg); } module_init(rt_mt6_init); module_exit(rt_mt6_exit);
1 8 8 8 1 8 8 8 8 8 8 8 8 8 3 3 3 3 4 4 3 1 5 5 1 1 1 3 4 4 3 3 3 3 3 1 3 1 3 3 3 3 3 2 1 3 3 3 1 14 14 14 14 14 14 1 1 1 3 3 3 3 3 3 3 1 2 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 2 1 2 3 1 18 15 10 3 1 2 3 3 3 3 3 1 18 18 3 1 17 2 1 1 1 13 3 13 10 5 13 3 10 5 3 5 1 1 1 1 1 4 2 1 1 1 1 1 12 5 12 5 11 2 1 1 1 1 1 3 3 3 3 3 3 2 2 2 2 3 14 14 14 14 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 // SPDX-License-Identifier: GPL-2.0-or-later /* md.c : Multiple Devices driver for Linux Copyright (C) 1998, 1999, 2000 Ingo Molnar completely rewritten, based on the MD driver code from Marc Zyngier Changes: - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> - kerneld support by Boris Tobotras <boris@xtalk.msk.su> - kmod support by: Cyrus Durgin - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> - lots of fixes and improvements to the RAID1/RAID5 and generic RAID code (such as request based resynchronization): Neil Brown <neilb@cse.unsw.edu.au>. - persistent bitmap code Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. Errors, Warnings, etc. Please use: pr_crit() for error conditions that risk data loss pr_err() for error conditions that are unexpected, like an IO error or internal inconsistency pr_warn() for error conditions that could have been predicated, like adding a device to an array when it has incompatible metadata pr_info() for every interesting, very rare events, like an array starting or stopping, or resync starting or stopping pr_debug() for everything else. */ #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/kthread.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/badblocks.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/ctype.h> #include <linux/string.h> #include <linux/hdreg.h> #include <linux/proc_fs.h> #include <linux/random.h> #include <linux/major.h> #include <linux/module.h> #include <linux/reboot.h> #include <linux/file.h> #include <linux/compat.h> #include <linux/delay.h> #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> #include <linux/raid/detect.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> #include <linux/part_stat.h> #include "md.h" #include "md-bitmap.h" #include "md-cluster.h" static const char *action_name[NR_SYNC_ACTIONS] = { [ACTION_RESYNC] = "resync", [ACTION_RECOVER] = "recover", [ACTION_CHECK] = "check", [ACTION_REPAIR] = "repair", [ACTION_RESHAPE] = "reshape", [ACTION_FROZEN] = "frozen", [ACTION_IDLE] = "idle", }; static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; /* * This workqueue is used for sync_work to register new sync_thread, and for * del_work to remove rdev, and for event_work that is only set by dm-raid. * * Noted that sync_work will grab reconfig_mutex, hence never flush this * workqueue whith reconfig_mutex grabbed. */ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error * count by 2 for every hour elapsed between read errors. */ #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 /* Default safemode delay: 200 msec */ #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit' * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load * does not show up that much. Increase it if you want to have more guaranteed * speed. Note that the RAID driver will use the maximum bandwidth * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle. * * Background sync IO speed control: * * - below speed min: * no limit; * - above speed min and below speed max: * a) if mddev is idle, then no limit; * b) if mddev is busy handling normal IO, then limit inflight sync IO * to sync_io_depth; * - above speed max: * sync IO can't be issued; * * Following configurations can be changed via /proc/sys/dev/raid/ for system * or /sys/block/mdX/md/ for one array. */ static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; static int sysctl_sync_io_depth = 32; static int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } static int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; } static int sync_io_depth(struct mddev *mddev) { return mddev->sync_io_depth ? mddev->sync_io_depth : sysctl_sync_io_depth; } static void rdev_uninit_serial(struct md_rdev *rdev) { if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) return; kvfree(rdev->serial); rdev->serial = NULL; } static void rdevs_uninit_serial(struct mddev *mddev) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) rdev_uninit_serial(rdev); } static int rdev_init_serial(struct md_rdev *rdev) { /* serial_nums equals with BARRIER_BUCKETS_NR */ int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); struct serial_in_rdev *serial = NULL; if (test_bit(CollisionCheck, &rdev->flags)) return 0; serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, GFP_KERNEL); if (!serial) return -ENOMEM; for (i = 0; i < serial_nums; i++) { struct serial_in_rdev *serial_tmp = &serial[i]; spin_lock_init(&serial_tmp->serial_lock); serial_tmp->serial_rb = RB_ROOT_CACHED; init_waitqueue_head(&serial_tmp->serial_io_wait); } rdev->serial = serial; set_bit(CollisionCheck, &rdev->flags); return 0; } static int rdevs_init_serial(struct mddev *mddev) { struct md_rdev *rdev; int ret = 0; rdev_for_each(rdev, mddev) { ret = rdev_init_serial(rdev); if (ret) break; } /* Free all resources if pool is not existed */ if (ret && !mddev->serial_info_pool) rdevs_uninit_serial(mddev); return ret; } /* * rdev needs to enable serial stuffs if it meets the conditions: * 1. it is multi-queue device flaged with writemostly. * 2. the write-behind mode is enabled. */ static int rdev_need_serial(struct md_rdev *rdev) { return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && test_bit(WriteMostly, &rdev->flags)); } /* * Init resource for rdev(s), then create serial_info_pool if: * 1. rdev is the first device which return true from rdev_enable_serial. * 2. rdev is NULL, means we want to enable serialization for all rdevs. */ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { int ret = 0; if (rdev && !rdev_need_serial(rdev) && !test_bit(CollisionCheck, &rdev->flags)) return; if (!rdev) ret = rdevs_init_serial(mddev); else ret = rdev_init_serial(rdev); if (ret) return; if (mddev->serial_info_pool == NULL) { /* * already in memalloc noio context by * mddev_suspend() */ mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); if (!mddev->serial_info_pool) { rdevs_uninit_serial(mddev); pr_err("can't alloc memory pool for serialization\n"); } } } /* * Free resource from rdev(s), and destroy serial_info_pool under conditions: * 1. rdev is the last device flaged with CollisionCheck. * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; if (mddev->serial_info_pool) { struct md_rdev *temp; int num = 0; /* used to track if other rdevs need the pool */ rdev_for_each(temp, mddev) { if (!rdev) { if (!mddev->serialize_policy || !rdev_need_serial(temp)) rdev_uninit_serial(temp); else num++; } else if (temp != rdev && test_bit(CollisionCheck, &temp->flags)) num++; } if (rdev) rdev_uninit_serial(rdev); if (num) pr_info("The mempool could be used by other devices\n"); else { mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; } } } static struct ctl_table_header *raid_table_header; static const struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sync_io_depth", .data = &sysctl_sync_io_depth, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int start_readonly; /* * The original mechanism for creating an md device is to create * a device node in /dev and to open it. This causes races with device-close. * The preferred method is to write to the "new_array" module parameter. * This can avoid races. * Setting create_on_open to false disables the original mechanism * so all the races disappear. */ static bool create_on_open = true; static bool legacy_async_del_gendisk = true; /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat * can use 'poll' or 'select' to find out when the event * count increases. * * Events are: * start array, stop array, error, add device, remove device, * start build, activate spare */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; void md_new_event(void) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); } EXPORT_SYMBOL_GPL(md_new_event); /* * Enables to iterate over all existing md arrays * all_mddevs_lock protects this list. */ static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); static bool is_md_suspended(struct mddev *mddev) { return percpu_ref_is_dying(&mddev->active_io); } /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. * We hold a refcount over the call to ->make_request. By the time that * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ static bool is_suspended(struct mddev *mddev, struct bio *bio) { if (is_md_suspended(mddev)) return true; if (bio_data_dir(bio) != WRITE) return false; if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) return false; if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) return false; if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) return false; return true; } bool md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { bio_wouldblock_error(bio); return true; } for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); if (!is_suspended(mddev, bio)) break; schedule(); } finish_wait(&mddev->sb_wait, &__wait); } if (!percpu_ref_tryget_live(&mddev->active_io)) goto check_suspended; if (!mddev->pers->make_request(mddev, bio)) { percpu_ref_put(&mddev->active_io); if (!mddev->gendisk && mddev->pers->prepare_suspend) return false; goto check_suspended; } percpu_ref_put(&mddev->active_io); return true; } EXPORT_SYMBOL(md_handle_request); static void md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return; } if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { bio_io_error(bio); return; } bio = bio_split_to_limits(bio); if (!bio) return; if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; } /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; md_handle_request(mddev, bio); } /* * Make sure no new requests are submitted to the device, and any requests that * have been submitted are completely handled. */ int mddev_suspend(struct mddev *mddev, bool interruptible) { int err = 0; /* * hold reconfig_mutex to wait for normal io will deadlock, because * other context can't update super_block, and normal io can rely on * updating super_block. */ lockdep_assert_not_held(&mddev->reconfig_mutex); if (interruptible) err = mutex_lock_interruptible(&mddev->suspend_mutex); else mutex_lock(&mddev->suspend_mutex); if (err) return err; if (mddev->suspended) { WRITE_ONCE(mddev->suspended, mddev->suspended + 1); mutex_unlock(&mddev->suspend_mutex); return 0; } percpu_ref_kill(&mddev->active_io); if (interruptible) err = wait_event_interruptible(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); else wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); if (err) { percpu_ref_resurrect(&mddev->active_io); mutex_unlock(&mddev->suspend_mutex); return err; } /* * For raid456, io might be waiting for reshape to make progress, * allow new reshape to start while waiting for io to be done to * prevent deadlock. */ WRITE_ONCE(mddev->suspended, mddev->suspended + 1); /* restrict memory reclaim I/O during raid array is suspend */ mddev->noio_flag = memalloc_noio_save(); mutex_unlock(&mddev->suspend_mutex); return 0; } EXPORT_SYMBOL_GPL(mddev_suspend); static void __mddev_resume(struct mddev *mddev, bool recovery_needed) { lockdep_assert_not_held(&mddev->reconfig_mutex); mutex_lock(&mddev->suspend_mutex); WRITE_ONCE(mddev->suspended, mddev->suspended - 1); if (mddev->suspended) { mutex_unlock(&mddev->suspend_mutex); return; } /* entred the memalloc scope from mddev_suspend() */ memalloc_noio_restore(mddev->noio_flag); percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); if (recovery_needed) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ mutex_unlock(&mddev->suspend_mutex); } void mddev_resume(struct mddev *mddev) { return __mddev_resume(mddev, true); } EXPORT_SYMBOL_GPL(mddev_resume); /* sync bdev before setting device to readonly or stopping raid*/ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) { mutex_lock(&mddev->open_mutex); if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { mutex_unlock(&mddev->open_mutex); return -EBUSY; } if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { mutex_unlock(&mddev->open_mutex); return -EBUSY; } mutex_unlock(&mddev->open_mutex); sync_blockdev(mddev->gendisk->part0); return 0; } /* * The only difference from bio_chain_endio() is that the current * bi_status of bio does not affect the bi_status of parent. */ static void md_end_flush(struct bio *bio) { struct bio *parent = bio->bi_private; /* * If any flush io error before the power failure, * disk data may be lost. */ if (bio->bi_status) pr_err("md: %pg flush io error %d\n", bio->bi_bdev, blk_status_to_errno(bio->bi_status)); bio_put(bio); bio_endio(parent); } bool md_flush_request(struct mddev *mddev, struct bio *bio) { struct md_rdev *rdev; struct bio *new; /* * md_flush_reqeust() should be called under md_handle_request() and * 'active_io' is already grabbed. Hence it's safe to get rdev directly * without rcu protection. */ WARN_ON(percpu_ref_is_zero(&mddev->active_io)); rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) continue; new = bio_alloc_bioset(rdev->bdev, 0, REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, &mddev->bio_set); new->bi_private = bio; new->bi_end_io = md_end_flush; bio_inc_remaining(bio); submit_bio(new); } if (bio_sectors(bio) == 0) { bio_endio(bio); return true; } bio->bi_opf &= ~REQ_PREFLUSH; return false; } EXPORT_SYMBOL(md_flush_request); static inline struct mddev *mddev_get(struct mddev *mddev) { lockdep_assert_held(&all_mddevs_lock); if (test_bit(MD_DELETED, &mddev->flags)) return NULL; atomic_inc(&mddev->active); return mddev; } static void mddev_delayed_delete(struct work_struct *ws); static void __mddev_put(struct mddev *mddev) { if (mddev->raid_disks || !list_empty(&mddev->disks) || mddev->ctime || mddev->hold_active) return; /* * If array is freed by stopping array, MD_DELETED is set by * do_md_stop(), MD_DELETED is still set here in case mddev is freed * directly by closing a mddev that is created by create_on_open. */ set_bit(MD_DELETED, &mddev->flags); /* * Call queue_work inside the spinlock so that flush_workqueue() after * mddev_find will succeed in waiting for the work to be done. */ queue_work(md_misc_wq, &mddev->del_work); } static void mddev_put_locked(struct mddev *mddev) { if (atomic_dec_and_test(&mddev->active)) __mddev_put(mddev); } void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; __mddev_put(mddev); spin_unlock(&all_mddevs_lock); } static void md_safemode_timeout(struct timer_list *t); static void md_start_sync(struct work_struct *ws); static void active_io_release(struct percpu_ref *ref) { struct mddev *mddev = container_of(ref, struct mddev, active_io); wake_up(&mddev->sb_wait); } static void no_op(struct percpu_ref *r) {} static bool mddev_set_bitmap_ops(struct mddev *mddev) { struct bitmap_operations *old = mddev->bitmap_ops; struct md_submodule_head *head; if (mddev->bitmap_id == ID_BITMAP_NONE || (old && old->head.id == mddev->bitmap_id)) return true; xa_lock(&md_submodule); head = xa_load(&md_submodule, mddev->bitmap_id); if (!head) { pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); goto err; } if (head->type != MD_BITMAP) { pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); goto err; } mddev->bitmap_ops = (void *)head; xa_unlock(&md_submodule); if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) { if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) pr_warn("md: cannot register extra bitmap attributes for %s\n", mdname(mddev)); else /* * Inform user with KOBJ_CHANGE about new bitmap * attributes. */ kobject_uevent(&mddev->kobj, KOBJ_CHANGE); } return true; err: xa_unlock(&md_submodule); return false; } static void mddev_clear_bitmap_ops(struct mddev *mddev) { if (!mddev_is_dm(mddev) && mddev->bitmap_ops && mddev->bitmap_ops->group) sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group); mddev->bitmap_ops = NULL; } int mddev_init(struct mddev *mddev) { if (!IS_ENABLED(CONFIG_MD_BITMAP)) mddev->bitmap_id = ID_BITMAP_NONE; else mddev->bitmap_id = ID_BITMAP; if (percpu_ref_init(&mddev->active_io, active_io_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) return -ENOMEM; if (percpu_ref_init(&mddev->writes_pending, no_op, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { percpu_ref_exit(&mddev->active_io); return -ENOMEM; } /* We want to start with the refcount at zero */ percpu_ref_put(&mddev->writes_pending); mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); INIT_LIST_HEAD(&mddev->deleting); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; mddev->last_sync_action = ACTION_IDLE; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); return 0; } EXPORT_SYMBOL_GPL(mddev_init); void mddev_destroy(struct mddev *mddev) { percpu_ref_exit(&mddev->active_io); percpu_ref_exit(&mddev->writes_pending); } EXPORT_SYMBOL_GPL(mddev_destroy); static struct mddev *mddev_find_locked(dev_t unit) { struct mddev *mddev; list_for_each_entry(mddev, &all_mddevs, all_mddevs) if (mddev->unit == unit) return mddev; return NULL; } /* find an unused unit number */ static dev_t mddev_alloc_unit(void) { static int next_minor = 512; int start = next_minor; bool is_free = 0; dev_t dev = 0; while (!is_free) { dev = MKDEV(MD_MAJOR, next_minor); next_minor++; if (next_minor > MINORMASK) next_minor = 0; if (next_minor == start) return 0; /* Oh dear, all in use. */ is_free = !mddev_find_locked(dev); } return dev; } static struct mddev *mddev_alloc(dev_t unit) { struct mddev *new; int error; if (unit && MAJOR(unit) != MD_MAJOR) unit &= ~((1 << MdpMinorShift) - 1); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); error = mddev_init(new); if (error) goto out_free_new; spin_lock(&all_mddevs_lock); if (unit) { error = -EEXIST; if (mddev_find_locked(unit)) goto out_destroy_new; new->unit = unit; if (MAJOR(unit) == MD_MAJOR) new->md_minor = MINOR(unit); else new->md_minor = MINOR(unit) >> MdpMinorShift; new->hold_active = UNTIL_IOCTL; } else { error = -ENODEV; new->unit = mddev_alloc_unit(); if (!new->unit) goto out_destroy_new; new->md_minor = MINOR(new->unit); new->hold_active = UNTIL_STOP; } list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); return new; out_destroy_new: spin_unlock(&all_mddevs_lock); mddev_destroy(new); out_free_new: kfree(new); return ERR_PTR(error); } static void mddev_free(struct mddev *mddev) { spin_lock(&all_mddevs_lock); list_del(&mddev->all_mddevs); spin_unlock(&all_mddevs_lock); mddev_destroy(mddev); kfree(mddev); } static const struct attribute_group md_redundancy_group; void mddev_unlock(struct mddev *mddev) { struct md_rdev *rdev; struct md_rdev *tmp; LIST_HEAD(delete); if (!list_empty(&mddev->deleting)) list_splice_init(&mddev->deleting, &delete); if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as * an access to the files will try to take reconfig_mutex * while holding the file unremovable, which leads to * a deadlock. * So hold set sysfs_active while the remove in happeing, * and anything else which might set ->to_remove or my * otherwise change the sysfs namespace will fail with * -EBUSY if sysfs_active is still set. * We set sysfs_active under reconfig_mutex and elsewhere * test it under the same mutex to ensure its correct value * is seen. */ const struct attribute_group *to_remove = mddev->to_remove; mddev->to_remove = NULL; mddev->sysfs_active = 1; mutex_unlock(&mddev->reconfig_mutex); if (mddev->kobj.sd) { if (to_remove != &md_redundancy_group) sysfs_remove_group(&mddev->kobj, to_remove); if (mddev->pers == NULL || mddev->pers->sync_request == NULL) { sysfs_remove_group(&mddev->kobj, &md_redundancy_group); if (mddev->sysfs_action) sysfs_put(mddev->sysfs_action); if (mddev->sysfs_completed) sysfs_put(mddev->sysfs_completed); if (mddev->sysfs_degraded) sysfs_put(mddev->sysfs_degraded); mddev->sysfs_action = NULL; mddev->sysfs_completed = NULL; mddev->sysfs_degraded = NULL; } } mddev->sysfs_active = 0; } else mutex_unlock(&mddev->reconfig_mutex); md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); list_for_each_entry_safe(rdev, tmp, &delete, same_set) { list_del_init(&rdev->same_set); kobject_del(&rdev->kobj); export_rdev(rdev, mddev); } if (!legacy_async_del_gendisk) { /* * Call del_gendisk after release reconfig_mutex to avoid * deadlock (e.g. call del_gendisk under the lock and an * access to sysfs files waits the lock) * And MD_DELETED is only used for md raid which is set in * do_md_stop. dm raid only uses md_stop to stop. So dm raid * doesn't need to check MD_DELETED when getting reconfig lock */ if (test_bit(MD_DELETED, &mddev->flags)) del_gendisk(mddev->gendisk); } } EXPORT_SYMBOL_GPL(mddev_unlock); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) { struct md_rdev *rdev; rdev_for_each_rcu(rdev, mddev) if (rdev->desc_nr == nr) return rdev; return NULL; } EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; return NULL; } struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; rdev_for_each_rcu(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; return NULL; } EXPORT_SYMBOL_GPL(md_find_rdev_rcu); static struct md_personality *get_pers(int level, char *clevel) { struct md_personality *ret = NULL; struct md_submodule_head *head; unsigned long i; xa_lock(&md_submodule); xa_for_each(&md_submodule, i, head) { if (head->type != MD_PERSONALITY) continue; if ((level != LEVEL_NONE && head->id == level) || !strcmp(head->name, clevel)) { if (try_module_get(head->owner)) ret = (void *)head; break; } } xa_unlock(&md_submodule); if (!ret) { if (level != LEVEL_NONE) pr_warn("md: personality for level %d is not loaded!\n", level); else pr_warn("md: personality for level %s is not loaded!\n", clevel); } return ret; } static void put_pers(struct md_personality *pers) { module_put(pers->head.owner); } /* return the offset of the super block in 512byte sectors */ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) { return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); } static int alloc_disk_sb(struct md_rdev *rdev) { rdev->sb_page = alloc_page(GFP_KERNEL); if (!rdev->sb_page) return -ENOMEM; return 0; } void md_rdev_clear(struct md_rdev *rdev) { if (rdev->sb_page) { put_page(rdev->sb_page); rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_start = 0; rdev->sectors = 0; } if (rdev->bb_page) { put_page(rdev->bb_page); rdev->bb_page = NULL; } badblocks_exit(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); static void super_written(struct bio *bio) { struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; if (bio->bi_status) { pr_err("md: %s gets error=%d\n", __func__, blk_status_to_errno(bio->bi_status)); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); set_bit(LastDev, &rdev->flags); } } else clear_bit(LastDev, &rdev->flags); bio_put(bio); rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); } /** * md_write_metadata - write metadata to underlying disk, including * array superblock, badblocks, bitmap superblock and bitmap bits. * @mddev: the array to write * @rdev: the underlying disk to write * @sector: the offset to @rdev * @size: the length of the metadata * @page: the metadata * @offset: the offset to @page * * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment * mddev->pending_writes before returning, and decrement it on completion, * waking up sb_wait. Caller must call md_super_wait() after issuing io to all * rdev. If an error occurred, md_error() will be called, and the @rdev will be * kicked out from @mddev. */ void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page, unsigned int offset) { struct bio *bio; if (!page) return; if (test_bit(Faulty, &rdev->flags)) return; bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META | REQ_PREFLUSH | REQ_FUA, GFP_NOIO, &mddev->sync_set); atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; __bio_add_page(bio, page, size, offset); bio->bi_private = rdev; bio->bi_end_io = super_written; if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && test_bit(FailFast, &rdev->flags) && !test_bit(LastDev, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; atomic_inc(&mddev->pending_writes); submit_bio(bio); } int md_super_wait(struct mddev *mddev) { /* wait for all superblock writes that were scheduled to complete */ wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) return -EAGAIN; return 0; } int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op) { struct bio bio; struct bio_vec bvec; if (metadata_op && rdev->meta_bdev) bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); else bio_init(&bio, rdev->bdev, &bvec, 1, opf); if (metadata_op) bio.bi_iter.bi_sector = sector + rdev->sb_start; else if (rdev->mddev->reshape_position != MaxSector && (rdev->mddev->reshape_backwards == (sector >= rdev->mddev->reshape_position))) bio.bi_iter.bi_sector = sector + rdev->new_data_offset; else bio.bi_iter.bi_sector = sector + rdev->data_offset; __bio_add_page(&bio, page, size, 0); submit_bio_wait(&bio); return !bio.bi_status; } EXPORT_SYMBOL_GPL(sync_page_io); static int read_disk_sb(struct md_rdev *rdev, int size) { if (rdev->sb_loaded) return 0; if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) goto fail; rdev->sb_loaded = 1; return 0; fail: pr_err("md: disabled device %pg, could not read superblock.\n", rdev->bdev); return -EINVAL; } static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) { return sb1->set_uuid0 == sb2->set_uuid0 && sb1->set_uuid1 == sb2->set_uuid1 && sb1->set_uuid2 == sb2->set_uuid2 && sb1->set_uuid3 == sb2->set_uuid3; } static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) { int ret; mdp_super_t *tmp1, *tmp2; tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); if (!tmp1 || !tmp2) { ret = 0; goto abort; } *tmp1 = *sb1; *tmp2 = *sb2; /* * nr_disks is not constant */ tmp1->nr_disks = 0; tmp2->nr_disks = 0; ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); abort: kfree(tmp1); kfree(tmp2); return ret; } static u32 md_csum_fold(u32 csum) { csum = (csum & 0xffff) + (csum >> 16); return (csum & 0xffff) + (csum >> 16); } static unsigned int calc_sb_csum(mdp_super_t *sb) { u64 newcsum = 0; u32 *sb32 = (u32*)sb; int i; unsigned int disk_csum, csum; disk_csum = sb->sb_csum; sb->sb_csum = 0; for (i = 0; i < MD_SB_BYTES/4 ; i++) newcsum += sb32[i]; csum = (newcsum & 0xffffffff) + (newcsum>>32); #ifdef CONFIG_ALPHA /* This used to use csum_partial, which was wrong for several * reasons including that different results are returned on * different architectures. It isn't critical that we get exactly * the same return value as before (we always csum_fold before * testing, and that removes any differences). However as we * know that csum_partial always returned a 16bit value on * alphas, do a fold to maximise conformity to previous behaviour. */ sb->sb_csum = md_csum_fold(disk_csum); #else sb->sb_csum = disk_csum; #endif return csum; } /* * Handle superblock details. * We want to be able to handle multiple superblock formats * so we have a common interface to them all, and an array of * different handlers. * We rely on user-space to write the initial superblock, and support * reading and updating of superblocks. * Interface methods are: * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) * loads and validates a superblock on dev. * if refdev != NULL, compare superblocks on both devices * Return: * 0 - dev has a superblock that is compatible with refdev * 1 - dev has a superblock that is compatible and newer than refdev * so dev should be used as the refdev in future * -EINVAL superblock incompatible or invalid * -othererror e.g. -EIO * * int validate_super(struct mddev *mddev, struct md_rdev *dev) * Verify that dev is acceptable into mddev. * The first time, mddev->raid_disks will be 0, and data from * dev should be merged in. Subsequent calls check that dev * is new enough. Return 0 or -EINVAL * * void sync_super(struct mddev *mddev, struct md_rdev *dev) * Update the superblock for rdev with data in mddev * This does not write to disc. * */ struct super_type { char *name; struct module *owner; int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version); int (*validate_super)(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev); void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); unsigned long long (*rdev_size_change)(struct md_rdev *rdev, sector_t num_sectors); int (*allow_new_offset)(struct md_rdev *rdev, unsigned long long new_offset); }; /* * Check that the given mddev has no bitmap. * * This function is called from the run method of all personalities that do not * support bitmaps. It prints an error message and returns non-zero if mddev * has a bitmap. Otherwise, it returns 0. * */ int md_check_no_bitmap(struct mddev *mddev) { if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; pr_warn("%s: bitmaps are not supported for %s\n", mdname(mddev), mddev->pers->head.name); return 1; } EXPORT_SYMBOL(md_check_no_bitmap); /* * load_super for 0.90.0 */ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { mdp_super_t *sb; int ret; bool spare_disk = true; /* * Calculate the position of the superblock (512byte sectors), * it's at the end of the disk. * * It also happens to be a multiple of 4Kb. */ rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; ret = -EINVAL; sb = page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { pr_warn("md: invalid raid superblock magic on %pg\n", rdev->bdev); goto abort; } if (sb->major_version != 0 || sb->minor_version < 90 || sb->minor_version > 91) { pr_warn("Bad version number %d.%d on %pg\n", sb->major_version, sb->minor_version, rdev->bdev); goto abort; } if (sb->raid_disks <= 0) goto abort; if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); goto abort; } rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; rdev->new_data_offset = 0; rdev->sb_size = MD_SB_BYTES; rdev->badblocks.shift = -1; rdev->desc_nr = sb->this_disk.number; /* not spare disk */ if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) spare_disk = false; if (!refdev) { if (!spare_disk) ret = 1; else ret = 0; } else { __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); if (!md_uuid_equal(refsb, sb)) { pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, refdev->bdev); goto abort; } if (!md_sb_equal(refsb, sb)) { pr_warn("md: %pg has same UUID but different superblock to %pg\n", rdev->bdev, refdev->bdev); goto abort; } ev1 = md_event(sb); ev2 = md_event(refsb); if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; } rdev->sectors = rdev->sb_start; /* Limit to 4TB as metadata cannot record more than that. * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ ret = -EINVAL; abort: return ret; } static u64 md_bitmap_events_cleared(struct mddev *mddev) { struct md_bitmap_stats stats; int err; if (!md_bitmap_enabled(mddev, false)) return 0; err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return 0; return stats.events_cleared; } /* * validate_super for 0.90.0 * note: we are not using "freshest" for 0.9 superblock */ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { mdp_disk_t *desc; mdp_super_t *sb = page_address(rdev->sb_page); __u64 ev1 = md_event(sb); rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; mddev->external = 0; mddev->chunk_sectors = sb->chunk_size >> 9; mddev->ctime = sb->ctime; mddev->utime = sb->utime; mddev->level = sb->level; mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; mddev->dev_sectors = ((sector_t)sb->size) * 2; mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; /* bitmap can use 60 K after the 4K superblocks */ mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); mddev->reshape_backwards = 0; if (mddev->minor_version >= 91) { mddev->reshape_position = sb->reshape_position; mddev->delta_disks = sb->delta_disks; mddev->new_level = sb->new_level; mddev->new_layout = sb->new_layout; mddev->new_chunk_sectors = sb->new_chunk >> 9; if (mddev->delta_disks < 0) mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } if (mddev->level == 0) mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) mddev->resync_offset = MaxSector; else { if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { mddev->resync_offset = sb->recovery_cp; } else mddev->resync_offset = 0; } memcpy(mddev->uuid+0, &sb->set_uuid0, 4); memcpy(mddev->uuid+4, &sb->set_uuid1, 4); memcpy(mddev->uuid+8, &sb->set_uuid2, 4); memcpy(mddev->uuid+12,&sb->set_uuid3, 4); mddev->max_disks = MD_SB_DISKS; if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && mddev->bitmap_info.file == NULL) { mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; } } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling, except * for spares (which don't need an event count) */ ++ev1; if (sb->disks[rdev->desc_nr].state & ( (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. */ if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0; } desc = sb->disks + rdev->desc_nr; if (desc->state & (1<<MD_DISK_FAULTY)) set_bit(Faulty, &rdev->flags); else if (desc->state & (1<<MD_DISK_SYNC)) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; rdev->saved_raid_disk = desc->raid_disk; } else if (desc->state & (1<<MD_DISK_ACTIVE)) { /* active but not in sync implies recovery up to * reshape position. We don't know exactly where * that is, so set to zero for now */ if (mddev->minor_version >= 91) { rdev->recovery_offset = 0; rdev->raid_disk = desc->raid_disk; } } if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); if (desc->state & (1<<MD_DISK_FAILFAST)) set_bit(FailFast, &rdev->flags); return 0; } /* * sync_super for 0.90.0 */ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) { mdp_super_t *sb; struct md_rdev *rdev2; int next_spare = mddev->raid_disks; /* make rdev->sb match mddev data.. * * 1/ zero out disks * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); * 3/ any empty disks < next_spare become removed * * disks[0] gets initialised to REMOVED because * we cannot be sure from other fields if it has * been initialised or not. */ int i; int active=0, working=0,failed=0,spare=0,nr_disks=0; rdev->sb_size = MD_SB_BYTES; sb = page_address(rdev->sb_page); memset(sb, 0, sizeof(*sb)); sb->md_magic = MD_SB_MAGIC; sb->major_version = mddev->major_version; sb->patch_version = mddev->patch_version; sb->gvalid_words = 0; /* ignored */ memcpy(&sb->set_uuid0, mddev->uuid+0, 4); memcpy(&sb->set_uuid1, mddev->uuid+4, 4); memcpy(&sb->set_uuid2, mddev->uuid+8, 4); memcpy(&sb->set_uuid3, mddev->uuid+12,4); sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); sb->level = mddev->level; sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); sb->state = 0; sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; if (mddev->reshape_position == MaxSector) sb->minor_version = 90; else { sb->minor_version = 91; sb->reshape_position = mddev->reshape_position; sb->new_level = mddev->new_level; sb->delta_disks = mddev->delta_disks; sb->new_layout = mddev->new_layout; sb->new_chunk = mddev->new_chunk_sectors << 9; } mddev->minor_version = sb->minor_version; if (mddev->in_sync) { sb->recovery_cp = mddev->resync_offset; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; if (mddev->resync_offset == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else sb->recovery_cp = 0; sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_sectors << 9; if (mddev->bitmap && mddev->bitmap_info.file == NULL) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); rdev_for_each(rdev2, mddev) { mdp_disk_t *d; int desc_nr; int is_active = test_bit(In_sync, &rdev2->flags); if (rdev2->raid_disk >= 0 && sb->minor_version >= 91) /* we have nowhere to store the recovery_offset, * but if it is not below the reshape_position, * we can piggy-back on that. */ is_active = 1; if (rdev2->raid_disk < 0 || test_bit(Faulty, &rdev2->flags)) is_active = 0; if (is_active) desc_nr = rdev2->raid_disk; else desc_nr = next_spare++; rdev2->desc_nr = desc_nr; d = &sb->disks[rdev2->desc_nr]; nr_disks++; d->number = rdev2->desc_nr; d->major = MAJOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev); if (is_active) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ if (test_bit(Faulty, &rdev2->flags)) d->state = (1<<MD_DISK_FAULTY); else if (is_active) { d->state = (1<<MD_DISK_ACTIVE); if (test_bit(In_sync, &rdev2->flags)) d->state |= (1<<MD_DISK_SYNC); active++; working++; } else { d->state = 0; spare++; working++; } if (test_bit(WriteMostly, &rdev2->flags)) d->state |= (1<<MD_DISK_WRITEMOSTLY); if (test_bit(FailFast, &rdev2->flags)) d->state |= (1<<MD_DISK_FAILFAST); } /* now set the "removed" and "faulty" bits on any missing devices */ for (i=0 ; i < mddev->raid_disks ; i++) { mdp_disk_t *d = &sb->disks[i]; if (d->state == 0 && d->number == 0) { d->number = i; d->raid_disk = i; d->state = (1<<MD_DISK_REMOVED); d->state |= (1<<MD_DISK_FAULTY); failed++; } } sb->nr_disks = nr_disks; sb->active_disks = active; sb->working_disks = working; sb->failed_disks = failed; sb->spare_disks = spare; sb->this_disk = sb->disks[rdev->desc_nr]; sb->sb_csum = calc_sb_csum(sb); } /* * rdev_size_change for 0.90.0 */ static unsigned long long super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. */ if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { md_write_metadata(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } static int super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { /* non-zero offset changes not possible with v0.90 */ return new_offset == 0; } /* * version 1 superblock */ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) { __le32 disk_csum; u32 csum; unsigned long long newcsum; int size = 256 + le32_to_cpu(sb->max_dev)*2; __le32 *isuper = (__le32*)sb; disk_csum = sb->sb_csum; sb->sb_csum = 0; newcsum = 0; for (; size >= 4; size -= 4) newcsum += le32_to_cpu(*isuper++); if (size == 2) newcsum += le16_to_cpu(*(__le16*) isuper); csum = (newcsum & 0xffffffff) + (newcsum >> 32); sb->sb_csum = disk_csum; return cpu_to_le32(csum); } static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; int ret; sector_t sb_start; sector_t sectors; int bmask; bool spare_disk = true; /* * Calculate the position of the superblock in 512byte sectors. * It is always aligned to a 4K boundary and * depeding on minor_version, it can be: * 0: At least 8K, but less than 12K, from end of device * 1: At start of device * 2: 4K from start of device. */ switch(minor_version) { case 0: sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; sb_start &= ~(sector_t)(4*2-1); break; case 1: sb_start = 0; break; case 2: sb_start = 8; break; default: return -EINVAL; } rdev->sb_start = sb_start; /* superblock is rarely larger than 1K, but it can be larger, * and it is safe to read 4k, so we do that */ ret = read_disk_sb(rdev, 4096); if (ret) return ret; sb = page_address(rdev->sb_page); if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || sb->major_version != cpu_to_le32(1) || le32_to_cpu(sb->max_dev) > (4096-256)/2 || le64_to_cpu(sb->super_offset) != rdev->sb_start || (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); return -EINVAL; } if (le64_to_cpu(sb->data_size) < 10) { pr_warn("md: data_size too small on %pg\n", rdev->bdev); return -EINVAL; } if (sb->pad0 || sb->pad3[0] || memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) /* Some padding is non-zero, might be a new feature */ return -EINVAL; rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); rdev->new_data_offset = rdev->data_offset; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; if (minor_version && rdev->data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; if (minor_version && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; rdev->desc_nr = le32_to_cpu(sb->dev_number); if (!rdev->bb_page) { rdev->bb_page = alloc_page(GFP_KERNEL); if (!rdev->bb_page) return -ENOMEM; } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && rdev->badblocks.count == 0) { /* need to load the bad block list. * Currently we limit it to one page. */ s32 offset; sector_t bb_sector; __le64 *bbp; int i; int sectors = le16_to_cpu(sb->bblog_size); if (sectors > (PAGE_SIZE / 512)) return -EINVAL; offset = le32_to_cpu(sb->bblog_offset); if (offset == 0) return -EINVAL; bb_sector = (long long)offset; if (!sync_page_io(rdev, bb_sector, sectors << 9, rdev->bb_page, REQ_OP_READ, true)) return -EIO; bbp = (__le64 *)page_address(rdev->bb_page); rdev->badblocks.shift = sb->bblog_shift; for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { u64 bb = le64_to_cpu(*bbp); int count = bb & (0x3ff); u64 sector = bb >> 10; sector <<= sb->bblog_shift; count <<= sb->bblog_shift; if (bb + 1 == 0) break; if (!badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; if ((le32_to_cpu(sb->feature_map) & (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); rdev->ppl.size = le16_to_cpu(sb->ppl.size); rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && sb->level != 0) return -EINVAL; /* not spare disk */ if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) spare_disk = false; if (!refdev) { if (!spare_disk) ret = 1; else ret = 0; } else { __u64 ev1, ev2; struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || sb->level != refsb->level || sb->layout != refsb->layout || sb->chunksize != refsb->chunksize) { pr_warn("md: %pg has strangely different superblock to %pg\n", rdev->bdev, refdev->bdev); return -EINVAL; } ev1 = le64_to_cpu(sb->events); ev2 = le64_to_cpu(refsb->events); if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; } if (minor_version) sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; else sectors = rdev->sb_start; if (sectors < le64_to_cpu(sb->data_size)) return -EINVAL; rdev->sectors = le64_to_cpu(sb->data_size); return ret; } static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); int role; rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; mddev->external = 0; mddev->chunk_sectors = le32_to_cpu(sb->chunksize); mddev->ctime = le64_to_cpu(sb->ctime); mddev->utime = le64_to_cpu(sb->utime); mddev->level = le32_to_cpu(sb->level); mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; /* Default location for bitmap is 1K after superblock * using 3K - total of 4K */ mddev->bitmap_info.default_offset = 1024 >> 9; mddev->bitmap_info.default_space = (4096-1024) >> 9; mddev->reshape_backwards = 0; mddev->resync_offset = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && mddev->bitmap_info.file == NULL) { mddev->bitmap_info.offset = (__s32)le32_to_cpu(sb->bitmap_offset); /* Metadata doesn't record how much space is available. * For 1.0, we assume we can use up to the superblock * if before, else to 4K beyond superblock. * For others, assume no change is possible. */ if (mddev->minor_version > 0) mddev->bitmap_info.space = 0; else if (mddev->bitmap_info.offset > 0) mddev->bitmap_info.space = 8 - mddev->bitmap_info.offset; else mddev->bitmap_info.space = -mddev->bitmap_info.offset; } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { mddev->reshape_position = le64_to_cpu(sb->reshape_position); mddev->delta_disks = le32_to_cpu(sb->delta_disks); mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); if (mddev->delta_disks < 0 || (mddev->delta_disks == 0 && (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_BACKWARDS))) mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } if (mddev->level == 0 && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) mddev->layout = -1; if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) return -EINVAL; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && (le32_to_cpu(sb->feature_map) & MD_FEATURE_MULTIPLE_PPLS)) return -EINVAL; set_bit(MD_HAS_PPL, &mddev->flags); } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count). * Similar to mdadm, we allow event counter difference of 1 * from the freshest device. */ if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) if (ev1 + 1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old. */ if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events) set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0; } if (rdev->desc_nr < 0 || rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { role = MD_DISK_ROLE_SPARE; rdev->desc_nr = -1; } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { /* * If we are assembling, and our event counter is smaller than the * highest event counter, we cannot trust our superblock about the role. * It could happen that our rdev was marked as Faulty, and all other * superblocks were updated with +1 event counter. * Then, before the next superblock update, which typically happens when * remove_and_add_spares() removes the device from the array, there was * a crash or reboot. * If we allow current rdev without consulting the freshest superblock, * we could cause data corruption. * Note that in this case our event counter is smaller by 1 than the * highest, otherwise, this rdev would not be allowed into array; * both kernel and mdadm allow event counter difference of 1. */ struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); if (rdev->desc_nr >= freshest_max_dev) { /* this is unexpected, better not proceed */ pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", mdname(mddev), rdev->bdev, rdev->desc_nr, freshest->bdev, freshest_max_dev); return -EUCLEAN; } role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", mdname(mddev), rdev->bdev, role, role, freshest->bdev); } else { role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); } switch (role) { case MD_DISK_ROLE_SPARE: /* spare */ break; case MD_DISK_ROLE_FAULTY: /* faulty */ set_bit(Faulty, &rdev->flags); break; case MD_DISK_ROLE_JOURNAL: /* journal device */ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { /* journal device without journal feature */ pr_warn("md: journal device provided without journal feature, ignoring the device\n"); return -EINVAL; } set_bit(Journal, &rdev->flags); rdev->journal_tail = le64_to_cpu(sb->journal_tail); rdev->raid_disk = 0; break; default: rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) { rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_BITMAP)) rdev->saved_raid_disk = -1; } else { /* * If the array is FROZEN, then the device can't * be in_sync with rest of array. */ if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) set_bit(In_sync, &rdev->flags); } rdev->raid_disk = role; break; } if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); if (sb->devflags & FailFast1) set_bit(FailFast, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); return 0; } static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb; struct md_rdev *rdev2; int max_dev, i; /* make rdev->sb match mddev and rdev data. */ sb = page_address(rdev->sb_page); sb->feature_map = 0; sb->pad0 = 0; sb->recovery_offset = cpu_to_le64(0); memset(sb->pad3, 0, sizeof(sb->pad3)); sb->utime = cpu_to_le64((__u64)mddev->utime); sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) sb->resync_offset = cpu_to_le64(mddev->resync_offset); else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) sb->resync_offset = cpu_to_le64(MaxSector); else sb->resync_offset = cpu_to_le64(0); sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->dev_sectors); sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); if (test_bit(FailFast, &rdev->flags)) sb->devflags |= FailFast1; else sb->devflags &= ~FailFast1; if (test_bit(WriteMostly, &rdev->flags)) sb->devflags |= WriteMostly1; else sb->devflags &= ~WriteMostly1; sb->data_offset = cpu_to_le64(rdev->data_offset); sb->data_size = cpu_to_le64(rdev->sectors); if (mddev->bitmap && mddev->bitmap_info.file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); if (rdev->saved_raid_disk >= 0 && mddev->bitmap) sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } /* Note: recovery_offset and journal_tail share space */ if (test_bit(Journal, &rdev->flags)) sb->journal_tail = cpu_to_le64(rdev->journal_tail); if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_REPLACEMENT); if (mddev->reshape_position != MaxSector) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); sb->reshape_position = cpu_to_le64(mddev->reshape_position); sb->new_layout = cpu_to_le32(mddev->new_layout); sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->new_level = cpu_to_le32(mddev->new_level); sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); if (mddev->delta_disks == 0 && mddev->reshape_backwards) sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); if (rdev->new_data_offset != rdev->data_offset) { sb->feature_map |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset - rdev->data_offset)); } } if (mddev_is_clustered(mddev)) sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) /* Cannot record bad blocks on this device */ md_error(mddev, rdev); else { struct badblocks *bb = &rdev->badblocks; __le64 *bbp = (__le64 *)page_address(rdev->bb_page); u64 *p = bb->page; sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); if (bb->changed) { unsigned seq; retry: seq = read_seqbegin(&bb->lock); memset(bbp, 0xff, PAGE_SIZE); for (i = 0 ; i < bb->count ; i++) { u64 internal_bb = p[i]; u64 store_bb = ((BB_OFFSET(internal_bb) << 10) | BB_LEN(internal_bb)); bbp[i] = cpu_to_le64(store_bb); } bb->changed = 0; if (read_seqretry(&bb->lock, seq)) goto retry; bb->sector = (rdev->sb_start + (int)le32_to_cpu(sb->bblog_offset)); bb->size = le16_to_cpu(sb->bblog_size); } } max_dev = 0; rdev_for_each(rdev2, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; if (max_dev > le32_to_cpu(sb->max_dev)) { int bmask; sb->max_dev = cpu_to_le32(max_dev); rdev->sb_size = max_dev * 2 + 256; bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; } else max_dev = le32_to_cpu(sb->max_dev); for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); if (test_bit(MD_HAS_PPL, &mddev->flags)) { if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); else sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); sb->ppl.size = cpu_to_le16(rdev->ppl.size); } rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else if (test_bit(Journal, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); } sb->sb_csum = calc_sb_1_csum(sb); } static sector_t super_1_choose_bm_space(sector_t dev_size) { sector_t bm_space; /* if the device is bigger than 8Gig, save 64k for bitmap * usage, if bigger than 200Gig, save 128k */ if (dev_size < 64*2) bm_space = 0; else if (dev_size - 64*2 >= 200*1024*1024*2) bm_space = 128*2; else if (dev_size - 4*2 > 8*1024*1024*2) bm_space = 64*2; else bm_space = 4*2; return bm_space; } static unsigned long long super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { struct mdp_superblock_1 *sb; sector_t max_sectors; if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->data_offset != rdev->new_data_offset) return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; } else if (rdev->mddev->bitmap_info.offset) { /* minor version 0 with bitmap we can't move */ return 0; } else { /* minor version 0; superblock after data */ sector_t sb_start, bm_space; sector_t dev_size = bdev_nr_sectors(rdev->bdev); /* 8K is for superblock */ sb_start = dev_size - 8*2; sb_start &= ~(sector_t)(4*2 - 1); bm_space = super_1_choose_bm_space(dev_size); /* Space that can be used to store date needs to decrease * superblock bitmap space and bad block space(4K) */ max_sectors = sb_start - bm_space - 4*2; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { md_write_metadata(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } static int super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { struct mddev *mddev = rdev->mddev; /* All necessary checks on new >= old have been done */ if (new_offset >= rdev->data_offset) return 1; /* with 1.0 metadata, there is no metadata to tread on * so we can always move back */ if (mddev->minor_version == 0) return 1; /* otherwise we must be sure not to step on * any metadata, so stay: * 36K beyond start of superblock * beyond end of badblocks * beyond write-intent bitmap */ if (rdev->sb_start + (32+4)*2 > new_offset) return 0; if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { struct md_bitmap_stats stats; int err; err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (!err && rdev->sb_start + mddev->bitmap_info.offset + stats.file_pages * (PAGE_SIZE >> 9) > new_offset) return 0; } if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) return 0; return 1; } static struct super_type super_types[] = { [0] = { .name = "0.90.0", .owner = THIS_MODULE, .load_super = super_90_load, .validate_super = super_90_validate, .sync_super = super_90_sync, .rdev_size_change = super_90_rdev_size_change, .allow_new_offset = super_90_allow_new_offset, }, [1] = { .name = "md-1", .owner = THIS_MODULE, .load_super = super_1_load, .validate_super = super_1_validate, .sync_super = super_1_sync, .rdev_size_change = super_1_rdev_size_change, .allow_new_offset = super_1_allow_new_offset, }, }; static void sync_super(struct mddev *mddev, struct md_rdev *rdev) { if (mddev->sync_super) { mddev->sync_super(mddev, rdev); return; } BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); super_types[mddev->major_version].sync_super(mddev, rdev); } static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) { struct md_rdev *rdev, *rdev2; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev1) { if (test_bit(Faulty, &rdev->flags) || test_bit(Journal, &rdev->flags) || rdev->raid_disk == -1) continue; rdev_for_each_rcu(rdev2, mddev2) { if (test_bit(Faulty, &rdev2->flags) || test_bit(Journal, &rdev2->flags) || rdev2->raid_disk == -1) continue; if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { rcu_read_unlock(); return 1; } } } rcu_read_unlock(); return 0; } static LIST_HEAD(pending_raid_disks); /* * Try to register data integrity profile for an mddev * * This is called when an array is started and after a disk has been kicked * from the array. It only succeeds if all working and active component devices * are integrity capable with matching profiles. */ int md_integrity_register(struct mddev *mddev) { if (list_empty(&mddev->disks)) return 0; /* nothing to do */ if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); return 0; } EXPORT_SYMBOL(md_integrity_register); static bool rdev_read_only(struct md_rdev *rdev) { return bdev_read_only(rdev->bdev) || (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); } static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; int err; /* prevent duplicates */ if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; if (rdev_read_only(rdev) && mddev->pers) return -EROFS; /* make sure rdev->sectors exceeds mddev->dev_sectors */ if (!test_bit(Journal, &rdev->flags) && rdev->sectors && (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care * about aligning sizes (e.g. linear) */ if (mddev->level > 0) return -ENOSPC; } else mddev->dev_sectors = rdev->sectors; } /* Verify rdev->desc_nr is unique. * If it is -1, assign a free number, else * check number is not in use */ rcu_read_lock(); if (rdev->desc_nr < 0) { int choice = 0; if (mddev->pers) choice = mddev->raid_disks; while (md_find_rdev_nr_rcu(mddev, choice)) choice++; rdev->desc_nr = choice; } else { if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { rcu_read_unlock(); return -EBUSY; } } rcu_read_unlock(); if (!test_bit(Journal, &rdev->flags) && mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { pr_warn("md: %s: array is limited to %d devices\n", mdname(mddev), mddev->max_disks); return -EBUSY; } snprintf(b, sizeof(b), "%pg", rdev->bdev); strreplace(b, '/', '!'); rdev->mddev = mddev; pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) mddev_create_serial_pool(mddev, rdev); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; /* failure here is OK */ err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); rdev->sysfs_unack_badblocks = sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); rdev->sysfs_badblocks = sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); /* May as well allow recovery to be retried once */ mddev->recovery_disabled++; return 0; fail: pr_warn("md: failed to register dev-%s for %s\n", b, mdname(mddev)); mddev_destroy_serial_pool(mddev, rdev); return err; } void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ static struct md_rdev claim_rdev; static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) { pr_debug("md: export_rdev(%pg)\n", rdev->bdev); md_rdev_clear(rdev); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif fput(rdev->bdev_file); rdev->bdev = NULL; kobject_put(&rdev->kobj); } static void md_kick_rdev_from_array(struct md_rdev *rdev) { struct mddev *mddev = rdev->mddev; bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); mddev_destroy_serial_pool(rdev->mddev, rdev); WRITE_ONCE(rdev->mddev, NULL); sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); sysfs_put(rdev->sysfs_unack_badblocks); sysfs_put(rdev->sysfs_badblocks); rdev->sysfs_state = NULL; rdev->sysfs_unack_badblocks = NULL; rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; synchronize_rcu(); /* * kobject_del() will wait for all in progress writers to be done, where * reconfig_mutex is held, hence it can't be called under * reconfig_mutex and it's delayed to mddev_unlock(). */ list_add(&rdev->same_set, &mddev->deleting); } static void export_array(struct mddev *mddev) { struct md_rdev *rdev; while (!list_empty(&mddev->disks)) { rdev = list_first_entry(&mddev->disks, struct md_rdev, same_set); md_kick_rdev_from_array(rdev); } mddev->raid_disks = 0; mddev->major_version = 0; } static bool set_in_sync(struct mddev *mddev) { lockdep_assert_held(&mddev->lock); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); spin_lock(&mddev->lock); if (!mddev->in_sync && percpu_ref_is_zero(&mddev->writes_pending)) { mddev->in_sync = 1; /* * Ensure ->in_sync is visible before we clear * ->sync_checkers. */ smp_mb(); set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_state); } if (--mddev->sync_checkers == 0) percpu_ref_switch_to_percpu(&mddev->writes_pending); } if (mddev->safemode == 1) mddev->safemode = 0; return mddev->in_sync; } static void sync_sbs(struct mddev *mddev, int nospares) { /* Update each superblock (in-memory image), but * if we are allowed to, skip spares which already * have the right event counter, or have one earlier * (which would mean they aren't being marked as dirty * with the rest of the array) */ struct md_rdev *rdev; rdev_for_each(rdev, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && rdev->sb_events+1 == mddev->events)) { /* Don't update this superblock */ rdev->sb_loaded = 2; } else { sync_super(mddev, rdev); rdev->sb_loaded = 1; } } } static bool does_sb_need_changing(struct mddev *mddev) { struct md_rdev *rdev = NULL, *iter; struct mdp_superblock_1 *sb; int role; /* Find a good rdev */ rdev_for_each(iter, mddev) if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { rdev = iter; break; } /* No good device found. */ if (!rdev) return false; sb = page_address(rdev->sb_page); /* Check if a device has become faulty or a spare become active */ rdev_for_each(rdev, mddev) { role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); /* Device activated? */ if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) return true; /* Device turned faulty? */ if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) return true; } /* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || (mddev->layout != le32_to_cpu(sb->layout)) || (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) return true; return false; } void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; int nospares = 0; int any_badblocks_changed = 0; int ret = -1; if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return; } repeat: if (mddev_is_clustered(mddev)) { if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) nospares = 1; ret = mddev->cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) mddev->cluster_ops->metadata_update_cancel(mddev); bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)); return; } } /* * First make sure individual recovery_offsets are correct * curr_resync_completed can only be used during recovery. * During reshape/resync it might use array-addresses rather * that device addresses. */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; } if (!mddev->persistent) { clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->external) { clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { rdev->badblocks.changed = 0; ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } clear_bit(Blocked, &rdev->flags); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } } wake_up(&mddev->sb_wait); return; } spin_lock(&mddev->lock); mddev->utime = ktime_get_real_seconds(); if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) /* just a clean<-> dirty transition, possibly leave spares alone, * though if events isn't the right even/odd, we will have to do * spares after all */ nospares = 1; if (force_change) nospares = 0; if (mddev->degraded) /* If the array is degraded, then skipping spares is both * dangerous and fairly pointless. * Dangerous because a device that was removed from the array * might have a event_count that still looks up-to-date, * so it can be re-added without a resync. * Pointless because if there are any spares to skip, * then a recovery will happen and soon that array won't * be degraded any more and the spare can go back to sleep then. */ nospares = 0; sync_req = mddev->in_sync; /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares && (mddev->in_sync && mddev->resync_offset == MaxSector) && mddev->can_decrease_events && mddev->events != 1) { mddev->events--; mddev->can_decrease_events = 0; } else { /* otherwise we have to go forward and ... */ mddev->events ++; mddev->can_decrease_events = nospares; } /* * This 64-bit counter should never wrap. * Either we are in around ~1 trillion A.C., assuming * 1 reboot per second, or we have a bug... */ WARN_ON(mddev->events == 0); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) any_badblocks_changed++; if (test_bit(Faulty, &rdev->flags)) set_bit(FaultRecorded, &rdev->flags); } sync_sbs(mddev, nospares); spin_unlock(&mddev->lock); pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: if (md_bitmap_enabled(mddev, false)) mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ if (!test_bit(Faulty, &rdev->flags)) { md_write_metadata(mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page, 0); pr_debug("md: (write) %pg's sb offset: %llu\n", rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { md_write_metadata(mddev, rdev, rdev->badblocks.sector, rdev->badblocks.size << 9, rdev->bb_page, 0); rdev->badblocks.size = 0; } } else pr_debug("md: %pg (skipping faulty)\n", rdev->bdev); } if (md_super_wait(mddev) < 0) goto rewrite; /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ if (mddev_is_clustered(mddev) && ret == 0) mddev->cluster_ops->metadata_update_finish(mddev); if (mddev->in_sync != sync_req || !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) /* have to write it out again */ goto repeat; wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify_dirent_safe(mddev->sysfs_completed); rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) clear_bit(Blocked, &rdev->flags); if (any_badblocks_changed) ack_all_badblocks(&rdev->badblocks); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } } EXPORT_SYMBOL(md_update_sb); static int add_bound_rdev(struct md_rdev *rdev) { struct mddev *mddev = rdev->mddev; int err = 0; bool add_journal = test_bit(Journal, &rdev->flags); if (!mddev->pers->hot_remove_disk || add_journal) { /* If there is hot_add_disk but no hot_remove_disk * then added disks for geometry changes, * and should be added immediately. */ super_types[mddev->major_version]. validate_super(mddev, NULL/*freshest*/, rdev); err = mddev->pers->hot_add_disk(mddev, rdev); if (err) { md_kick_rdev_from_array(rdev); return err; } } sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_new_event(); return 0; } /* words written to sysfs files may, or may not, be \n terminated. * We want to accept with case. For this we use cmd_match. */ static int cmd_match(const char *cmd, const char *str) { /* See if cmd, written into a sysfs file, matches * str. They must either be the same, or cmd can * have a trailing newline */ while (*cmd && *str && *cmd == *str) { cmd++; str++; } if (*cmd == '\n') cmd++; if (*str || *cmd) return 0; return 1; } struct rdev_sysfs_entry { struct attribute attr; ssize_t (*show)(struct md_rdev *, char *); ssize_t (*store)(struct md_rdev *, const char *, size_t); }; static ssize_t state_show(struct md_rdev *rdev, char *page) { char *sep = ","; size_t len = 0; unsigned long flags = READ_ONCE(rdev->flags); if (test_bit(Faulty, &flags) || (!test_bit(ExternalBbl, &flags) && rdev->badblocks.unacked_exist)) len += sprintf(page+len, "faulty%s", sep); if (test_bit(In_sync, &flags)) len += sprintf(page+len, "in_sync%s", sep); if (test_bit(Journal, &flags)) len += sprintf(page+len, "journal%s", sep); if (test_bit(WriteMostly, &flags)) len += sprintf(page+len, "write_mostly%s", sep); if (test_bit(Blocked, &flags) || (rdev->badblocks.unacked_exist && !test_bit(Faulty, &flags))) len += sprintf(page+len, "blocked%s", sep); if (!test_bit(Faulty, &flags) && !test_bit(Journal, &flags) && !test_bit(In_sync, &flags)) len += sprintf(page+len, "spare%s", sep); if (test_bit(WriteErrorSeen, &flags)) len += sprintf(page+len, "write_error%s", sep); if (test_bit(WantReplacement, &flags)) len += sprintf(page+len, "want_replacement%s", sep); if (test_bit(Replacement, &flags)) len += sprintf(page+len, "replacement%s", sep); if (test_bit(ExternalBbl, &flags)) len += sprintf(page+len, "external_bbl%s", sep); if (test_bit(FailFast, &flags)) len += sprintf(page+len, "failfast%s", sep); if (len) len -= strlen(sep); return len+sprintf(page+len, "\n"); } static ssize_t state_store(struct md_rdev *rdev, const char *buf, size_t len) { /* can write * faulty - simulates an error * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly * blocked - sets the Blocked flags * -blocked - clears the Blocked and possibly simulates an error * insync - sets Insync providing device isn't active * -insync - clear Insync for a device with a slot assigned, * so that it gets rebuilt based on bitmap * write_error - sets WriteErrorSeen * -write_error - clears WriteErrorSeen * {,-}failfast - set/clear FailFast */ struct mddev *mddev = rdev->mddev; int err = -EINVAL; bool need_update_sb = false; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); if (test_bit(MD_BROKEN, &rdev->mddev->flags)) err = -EBUSY; else err = 0; } else if (cmd_match(buf, "remove")) { if (rdev->mddev->pers) { clear_bit(Blocked, &rdev->flags); remove_and_add_spares(rdev->mddev, rdev); } if (rdev->raid_disk >= 0) err = -EBUSY; else { err = 0; if (mddev_is_clustered(mddev)) err = mddev->cluster_ops->remove_disk(mddev, rdev); if (err == 0) { md_kick_rdev_from_array(rdev); if (mddev->pers) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_new_event(); } } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); mddev_create_serial_pool(rdev->mddev, rdev); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-writemostly")) { mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); need_update_sb = true; err = 0; } else if (cmd_match(buf, "blocked")) { set_bit(Blocked, &rdev->flags); err = 0; } else if (cmd_match(buf, "-blocked")) { if (!test_bit(Faulty, &rdev->flags) && !test_bit(ExternalBbl, &rdev->flags) && rdev->badblocks.unacked_exist) { /* metadata handler doesn't understand badblocks, * so we need to fail the device */ md_error(rdev->mddev, rdev); } clear_bit(Blocked, &rdev->flags); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); err = 0; } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; } else if (cmd_match(buf, "failfast")) { set_bit(FailFast, &rdev->flags); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-failfast")) { clear_bit(FailFast, &rdev->flags); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags)) { if (rdev->mddev->pers == NULL) { clear_bit(In_sync, &rdev->flags); rdev->saved_raid_disk = rdev->raid_disk; rdev->raid_disk = -1; err = 0; } } else if (cmd_match(buf, "write_error")) { set_bit(WriteErrorSeen, &rdev->flags); err = 0; } else if (cmd_match(buf, "-write_error")) { clear_bit(WriteErrorSeen, &rdev->flags); err = 0; } else if (cmd_match(buf, "want_replacement")) { /* Any non-spare device that is not a replacement can * become want_replacement at any time, but we then need to * check if recovery is needed. */ if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(Replacement, &rdev->flags)) set_bit(WantReplacement, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); err = 0; } else if (cmd_match(buf, "-want_replacement")) { /* Clearing 'want_replacement' is always allowed. * Once replacements starts it is too late though. */ err = 0; clear_bit(WantReplacement, &rdev->flags); } else if (cmd_match(buf, "replacement")) { /* Can only set a device as a replacement when array has not * yet been started. Once running, replacement is automatic * from spares, or by assigning 'slot'. */ if (rdev->mddev->pers) err = -EBUSY; else { set_bit(Replacement, &rdev->flags); err = 0; } } else if (cmd_match(buf, "-replacement")) { /* Similarly, can only clear Replacement before start */ if (rdev->mddev->pers) err = -EBUSY; else { clear_bit(Replacement, &rdev->flags); err = 0; } } else if (cmd_match(buf, "re-add")) { if (!rdev->mddev->pers) err = -EINVAL; else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && rdev->saved_raid_disk >= 0) { /* clear_bit is performed _after_ all the devices * have their local Faulty bit cleared. If any writes * happen in the meantime in the local node, they * will land in the local bitmap, which will be synced * by this node eventually */ if (!mddev_is_clustered(rdev->mddev) || (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { clear_bit(Faulty, &rdev->flags); err = add_bound_rdev(rdev); } } else err = -EBUSY; } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { set_bit(ExternalBbl, &rdev->flags); rdev->badblocks.shift = 0; err = 0; } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { clear_bit(ExternalBbl, &rdev->flags); err = 0; } if (need_update_sb) md_update_sb(mddev, 1); if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? err : len; } static struct rdev_sysfs_entry rdev_state = __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); static ssize_t errors_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); } static ssize_t errors_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned int n; int rv; rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; atomic_set(&rdev->corrected_errors, n); return len; } static struct rdev_sysfs_entry rdev_errors = __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t slot_show(struct md_rdev *rdev, char *page) { if (test_bit(Journal, &rdev->flags)) return sprintf(page, "journal\n"); else if (rdev->raid_disk < 0) return sprintf(page, "none\n"); else return sprintf(page, "%d\n", rdev->raid_disk); } static ssize_t slot_store(struct md_rdev *rdev, const char *buf, size_t len) { int slot; int err; if (test_bit(Journal, &rdev->flags)) return -EBUSY; if (strncmp(buf, "none", 4)==0) slot = -1; else { err = kstrtouint(buf, 10, (unsigned int *)&slot); if (err < 0) return err; if (slot < 0) /* overflow */ return -ENOSPC; } if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also * updating the 'rd%d' link, and communicating * with the personality with ->hot_*_disk. * For now we only support removing * failed/spare devices. This normally happens automatically, * but not when the metadata is externally managed. */ if (rdev->raid_disk == -1) return -EEXIST; /* personality does all needed checks */ if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; clear_bit(Blocked, &rdev->flags); remove_and_add_spares(rdev->mddev, rdev); if (rdev->raid_disk >= 0) return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); } else if (rdev->mddev->pers) { /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here. */ int err; if (rdev->raid_disk != -1) return -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) return -EBUSY; if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; if (slot >= rdev->mddev->raid_disks && slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = slot; else rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); if (err) { rdev->raid_disk = -1; return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); /* failure here is OK */; sysfs_link_rdev(rdev->mddev, rdev); /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; /* assume it is working */ clear_bit(Faulty, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); set_bit(In_sync, &rdev->flags); sysfs_notify_dirent_safe(rdev->sysfs_state); } return len; } static struct rdev_sysfs_entry rdev_slot = __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); static ssize_t offset_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); } static ssize_t offset_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long offset; if (kstrtoull(buf, 10, &offset) < 0) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; if (rdev->sectors && rdev->mddev->external) /* Must set offset before size, so overlap checks * can be sane */ return -EBUSY; rdev->data_offset = offset; rdev->new_data_offset = offset; return len; } static struct rdev_sysfs_entry rdev_offset = __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); static ssize_t new_offset_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->new_data_offset); } static ssize_t new_offset_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long new_offset; struct mddev *mddev = rdev->mddev; if (kstrtoull(buf, 10, &new_offset) < 0) return -EINVAL; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; if (new_offset == rdev->data_offset) /* reset is always permitted */ ; else if (new_offset > rdev->data_offset) { /* must not push array size beyond rdev_sectors */ if (new_offset - rdev->data_offset + mddev->dev_sectors > rdev->sectors) return -E2BIG; } /* Metadata worries about other space details. */ /* decreasing the offset is inconsistent with a backwards * reshape. */ if (new_offset < rdev->data_offset && mddev->reshape_backwards) return -EINVAL; /* Increasing offset is inconsistent with forwards * reshape. reshape_direction should be set to * 'backwards' first. */ if (new_offset > rdev->data_offset && !mddev->reshape_backwards) return -EINVAL; if (mddev->pers && mddev->persistent && !super_types[mddev->major_version] .allow_new_offset(rdev, new_offset)) return -E2BIG; rdev->new_data_offset = new_offset; if (new_offset > rdev->data_offset) mddev->reshape_backwards = 1; else if (new_offset < rdev->data_offset) mddev->reshape_backwards = 0; return len; } static struct rdev_sysfs_entry rdev_new_offset = __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); static ssize_t rdev_size_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) { /* check if two start/length pairs overlap */ if (a->data_offset + a->sectors <= b->data_offset) return false; if (b->data_offset + b->sectors <= a->data_offset) return false; return true; } static bool md_rdev_overlaps(struct md_rdev *rdev) { struct mddev *mddev; struct md_rdev *rdev2; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (test_bit(MD_DELETED, &mddev->flags)) continue; rdev_for_each(rdev2, mddev) { if (rdev != rdev2 && rdev->bdev == rdev2->bdev && md_rdevs_overlap(rdev, rdev2)) { spin_unlock(&all_mddevs_lock); return true; } } } spin_unlock(&all_mddevs_lock); return false; } static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) { unsigned long long blocks; sector_t new; if (kstrtoull(buf, 10, &blocks) < 0) return -EINVAL; if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) return -EINVAL; /* sector conversion overflow */ new = blocks * 2; if (new != blocks * 2) return -EINVAL; /* unsigned long long to sector_t overflow */ *sectors = new; return 0; } static ssize_t rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) { struct mddev *my_mddev = rdev->mddev; sector_t oldsectors = rdev->sectors; sector_t sectors; if (test_bit(Journal, &rdev->flags)) return -EBUSY; if (strict_blocks_to_sectors(buf, &sectors) < 0) return -EINVAL; if (rdev->data_offset != rdev->new_data_offset) return -EINVAL; /* too confusing */ if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { sectors = super_types[my_mddev->major_version]. rdev_size_change(rdev, sectors); if (!sectors) return -EBUSY; } else if (!sectors) sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!my_mddev->pers->resize) /* Cannot change size for RAID0 or Linear etc */ return -EINVAL; } if (sectors < my_mddev->dev_sectors) return -EINVAL; /* component must fit device */ rdev->sectors = sectors; /* * Check that all other rdevs with the same bdev do not overlap. This * check does not provide a hard guarantee, it just helps avoid * dangerous mistakes. */ if (sectors > oldsectors && my_mddev->external && md_rdev_overlaps(rdev)) { /* * Someone else could have slipped in a size change here, but * doing so is just silly. We put oldsectors back because we * know it is safe, and trust userspace not to race with itself. */ rdev->sectors = oldsectors; return -EBUSY; } return len; } static struct rdev_sysfs_entry rdev_size = __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) { unsigned long long recovery_start = rdev->recovery_offset; if (test_bit(In_sync, &rdev->flags) || recovery_start == MaxSector) return sprintf(page, "none\n"); return sprintf(page, "%llu\n", recovery_start); } static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long recovery_start; if (cmd_match(buf, "none")) recovery_start = MaxSector; else if (kstrtoull(buf, 10, &recovery_start)) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; rdev->recovery_offset = recovery_start; if (recovery_start == MaxSector) set_bit(In_sync, &rdev->flags); else clear_bit(In_sync, &rdev->flags); return len; } static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); /* sysfs access to bad-blocks list. * We present two files. * 'bad-blocks' lists sector numbers and lengths of ranges that * are recorded as bad. The list is truncated to fit within * the one-page limit of sysfs. * Writing "sector length" to this file adds an acknowledged * bad block list. * 'unacknowledged-bad-blocks' lists bad blocks that have not yet * been acknowledged. Writing to this file adds bad blocks * without acknowledging them. This is largely for testing. */ static ssize_t bb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 0); } static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) { int rv = badblocks_store(&rdev->badblocks, page, len, 0); /* Maybe that ack was all we needed */ if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) wake_up(&rdev->blocked_wait); return rv; } static struct rdev_sysfs_entry rdev_bad_blocks = __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); static ssize_t ubb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 1); } static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) { return badblocks_store(&rdev->badblocks, page, len, 1); } static struct rdev_sysfs_entry rdev_unack_bad_blocks = __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); static ssize_t ppl_sector_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); } static ssize_t ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long sector; if (kstrtoull(buf, 10, &sector) < 0) return -EINVAL; if (sector != (sector_t)sector) return -EINVAL; if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && rdev->raid_disk >= 0) return -EBUSY; if (rdev->mddev->persistent) { if (rdev->mddev->major_version == 0) return -EINVAL; if ((sector > rdev->sb_start && sector - rdev->sb_start > S16_MAX) || (sector < rdev->sb_start && rdev->sb_start - sector > -S16_MIN)) return -EINVAL; rdev->ppl.offset = sector - rdev->sb_start; } else if (!rdev->mddev->external) { return -EBUSY; } rdev->ppl.sector = sector; return len; } static struct rdev_sysfs_entry rdev_ppl_sector = __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); static ssize_t ppl_size_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%u\n", rdev->ppl.size); } static ssize_t ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned int size; if (kstrtouint(buf, 10, &size) < 0) return -EINVAL; if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && rdev->raid_disk >= 0) return -EBUSY; if (rdev->mddev->persistent) { if (rdev->mddev->major_version == 0) return -EINVAL; if (size > U16_MAX) return -EINVAL; } else if (!rdev->mddev->external) { return -EBUSY; } rdev->ppl.size = size; return len; } static struct rdev_sysfs_entry rdev_ppl_size = __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, &rdev_new_offset.attr, &rdev_size.attr, &rdev_recovery_start.attr, &rdev_bad_blocks.attr, &rdev_unack_bad_blocks.attr, &rdev_ppl_sector.attr, &rdev_ppl_size.attr, NULL, }; ATTRIBUTE_GROUPS(rdev_default); static ssize_t rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); if (!entry->show) return -EIO; if (!rdev->mddev) return -ENODEV; return entry->show(rdev, page); } static ssize_t rdev_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); struct kernfs_node *kn = NULL; bool suspend = false; ssize_t rv; struct mddev *mddev = READ_ONCE(rdev->mddev); if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!mddev) return -ENODEV; if (entry->store == state_store) { if (cmd_match(page, "remove")) kn = sysfs_break_active_protection(kobj, attr); if (cmd_match(page, "remove") || cmd_match(page, "re-add") || cmd_match(page, "writemostly") || cmd_match(page, "-writemostly")) suspend = true; } rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); if (!rv) { if (rdev->mddev == NULL) rv = -ENODEV; else rv = entry->store(rdev, page, length); suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); } if (kn) sysfs_unbreak_active_protection(kn); return rv; } static void rdev_free(struct kobject *ko) { struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); kfree(rdev); } static const struct sysfs_ops rdev_sysfs_ops = { .show = rdev_attr_show, .store = rdev_attr_store, }; static const struct kobj_type rdev_ktype = { .release = rdev_free, .sysfs_ops = &rdev_sysfs_ops, .default_groups = rdev_default_groups, }; int md_rdev_init(struct md_rdev *rdev) { rdev->desc_nr = -1; rdev->saved_raid_disk = -1; rdev->raid_disk = -1; rdev->flags = 0; rdev->data_offset = 0; rdev->new_data_offset = 0; rdev->sb_events = 0; rdev->last_read_error = 0; rdev->sb_loaded = 0; rdev->bb_page = NULL; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); INIT_LIST_HEAD(&rdev->same_set); init_waitqueue_head(&rdev->blocked_wait); /* Add space to store bad block list. * This reserves the space even on arrays where it cannot * be used - I wonder if that matters */ return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * * mark the device faulty if: * * - the device is nonexistent (zero size) * - the device has no valid superblock * * a faulty rdev _never_ has rdev->sb set. */ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { struct md_rdev *rdev; sector_t size; int err; rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) return ERR_PTR(-ENOMEM); err = md_rdev_init(rdev); if (err) goto out_free_rdev; err = alloc_disk_sb(rdev); if (err) goto out_clear_rdev; rdev->bdev_file = bdev_file_open_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, super_format == -2 ? &claim_rdev : rdev, NULL); if (IS_ERR(rdev->bdev_file)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", MAJOR(newdev), MINOR(newdev)); err = PTR_ERR(rdev->bdev_file); goto out_clear_rdev; } rdev->bdev = file_bdev(rdev->bdev_file); kobject_init(&rdev->kobj, &rdev_ktype); size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; if (!size) { pr_warn("md: %pg has zero or unknown size, marking faulty!\n", rdev->bdev); err = -EINVAL; goto out_blkdev_put; } if (super_format >= 0) { err = super_types[super_format]. load_super(rdev, NULL, super_minor); if (err == -EINVAL) { pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", rdev->bdev, super_format, super_minor); goto out_blkdev_put; } if (err < 0) { pr_warn("md: could not read %pg's sb, not importing!\n", rdev->bdev); goto out_blkdev_put; } } return rdev; out_blkdev_put: fput(rdev->bdev_file); out_clear_rdev: md_rdev_clear(rdev); out_free_rdev: kfree(rdev); return ERR_PTR(err); } /* * Check a full RAID array for plausibility */ static int analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; freshest = NULL; rdev_for_each_safe(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: freshest = rdev; break; case 0: break; default: pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", rdev->bdev); md_kick_rdev_from_array(rdev); } /* Cannot find a valid fresh disk */ if (!freshest) { pr_warn("md: cannot find a valid disk\n"); return -EINVAL; } super_types[mddev->major_version]. validate_super(mddev, NULL/*freshest*/, freshest); i = 0; rdev_for_each_safe(rdev, tmp, mddev) { if (mddev->max_disks && (rdev->desc_nr >= mddev->max_disks || i > mddev->max_disks)) { pr_warn("md: %s: %pg: only %d devices permitted\n", mdname(mddev), rdev->bdev, mddev->max_disks); md_kick_rdev_from_array(rdev); continue; } if (rdev != freshest) { if (super_types[mddev->major_version]. validate_super(mddev, freshest, rdev)) { pr_warn("md: kicking non-fresh %pg from array!\n", rdev->bdev); md_kick_rdev_from_array(rdev); continue; } } if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && !test_bit(Journal, &rdev->flags)) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } } return 0; } /* Read a fixed-point number. * Numbers in sysfs attributes should be in "standard" units where * possible, so time should be in seconds. * However we internally use a a much smaller unit such as * milliseconds or jiffies. * This function takes a decimal number with a possible fractional * component, and produces an integer which is the result of * multiplying that number by 10^'scale'. * all without any floating-point arithmetic. */ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) { unsigned long result = 0; long decimals = -1; while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { if (*cp == '.') decimals = 0; else if (decimals < scale) { unsigned int value; value = *cp - '0'; result = result * 10 + value; if (decimals >= 0) decimals++; } cp++; } if (*cp == '\n') cp++; if (*cp) return -EINVAL; if (decimals < 0) decimals = 0; *res = result * int_pow(10, scale - decimals); return 0; } static ssize_t safe_delay_show(struct mddev *mddev, char *page) { unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); } static ssize_t safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { unsigned long msec; if (mddev_is_clustered(mddev)) { pr_warn("md: Safemode is disabled for clustered mode\n"); return -EINVAL; } if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) return -EINVAL; if (msec == 0) mddev->safemode_delay = 0; else { unsigned long old_delay = mddev->safemode_delay; unsigned long new_delay = (msec*HZ)/1000; if (new_delay == 0) new_delay = 1; mddev->safemode_delay = new_delay; if (new_delay < old_delay || old_delay == 0) mod_timer(&mddev->safemode_timer, jiffies+1); } return len; } static struct md_sysfs_entry md_safe_delay = __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t level_show(struct mddev *mddev, char *page) { struct md_personality *p; int ret; spin_lock(&mddev->lock); p = mddev->pers; if (p) ret = sprintf(page, "%s\n", p->head.name); else if (mddev->clevel[0]) ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) ret = sprintf(page, "%d\n", mddev->level); else ret = 0; spin_unlock(&mddev->lock); return ret; } static ssize_t level_store(struct mddev *mddev, const char *buf, size_t len) { char clevel[16]; ssize_t rv; size_t slen = len; struct md_personality *pers, *oldpers; long level; void *priv, *oldpriv; struct md_rdev *rdev; if (slen == 0 || slen >= sizeof(clevel)) return -EINVAL; rv = mddev_suspend_and_lock(mddev); if (rv) return rv; if (mddev->pers == NULL) { memcpy(mddev->clevel, buf, slen); if (mddev->clevel[slen-1] == '\n') slen--; mddev->clevel[slen] = 0; mddev->level = LEVEL_NONE; rv = len; goto out_unlock; } rv = -EROFS; if (!md_is_rdwr(mddev)) goto out_unlock; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape * - old personality can be suspended * - new personality will access other array. */ rv = -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) goto out_unlock; rv = -EINVAL; if (!mddev->pers->quiesce) { pr_warn("md: %s: %s does not support online personality change\n", mdname(mddev), mddev->pers->head.name); goto out_unlock; } /* Now find the new personality */ memcpy(clevel, buf, slen); if (clevel[slen-1] == '\n') slen--; clevel[slen] = 0; if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; if (request_module("md-%s", clevel) != 0) request_module("md-level-%s", clevel); pers = get_pers(level, clevel); if (!pers) { rv = -EINVAL; goto out_unlock; } if (pers == mddev->pers) { /* Nothing to do! */ put_pers(pers); rv = len; goto out_unlock; } if (!pers->takeover) { put_pers(pers); pr_warn("md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); rv = -EINVAL; goto out_unlock; } rdev_for_each(rdev, mddev) rdev->new_raid_disk = rdev->raid_disk; /* ->takeover must set new_* and/or delta_disks * if it succeeds, and may set them when it fails. */ priv = pers->takeover(mddev); if (IS_ERR(priv)) { mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; mddev->reshape_backwards = 0; put_pers(pers); pr_warn("md: %s: %s would not accept array\n", mdname(mddev), clevel); rv = PTR_ERR(priv); goto out_unlock; } /* Looks like we have a winner */ mddev_detach(mddev); spin_lock(&mddev->lock); oldpers = mddev->pers; oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; mddev->reshape_backwards = 0; mddev->degraded = 0; spin_unlock(&mddev->lock); if (oldpers->sync_request == NULL && mddev->external) { /* We are converting from a no-redundancy array * to a redundancy array and metadata is managed * externally so we need to be sure that writes * won't block due to a need to transition * clean->dirty * until external management is started. */ mddev->in_sync = 0; mddev->safemode_delay = 0; mddev->safemode = 0; } oldpers->free(mddev, oldpriv); if (oldpers->sync_request == NULL && pers->sync_request != NULL) { /* need to add the md_redundancy_group */ if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } if (oldpers->sync_request != NULL && pers->sync_request == NULL) { /* need to remove the md_redundancy_group */ if (mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; } put_pers(oldpers); rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk >= mddev->raid_disks) rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue; sysfs_unlink_rdev(mddev, rdev); } rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk == rdev->raid_disk) continue; rdev->raid_disk = rdev->new_raid_disk; if (rdev->raid_disk < 0) clear_bit(In_sync, &rdev->flags); else { if (sysfs_link_rdev(mddev, rdev)) pr_warn("md: cannot register rd%d for %s after level change\n", rdev->raid_disk, mdname(mddev)); } } if (pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync */ mddev->in_sync = 1; timer_delete_sync(&mddev->safemode_timer); } pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread) md_update_sb(mddev, 1); sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(); rv = len; out_unlock: mddev_unlock_and_resume(mddev); return rv; } static struct md_sysfs_entry md_level = __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); static ssize_t new_level_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->new_level); } static ssize_t new_level_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int n; int err; err = kstrtouint(buf, 10, &n); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; mddev->new_level = n; md_update_sb(mddev, 1); mddev_unlock(mddev); return len; } static struct md_sysfs_entry md_new_level = __ATTR(new_level, 0664, new_level_show, new_level_store); static ssize_t bitmap_type_show(struct mddev *mddev, char *page) { struct md_submodule_head *head; unsigned long i; ssize_t len = 0; if (mddev->bitmap_id == ID_BITMAP_NONE) len += sprintf(page + len, "[none] "); else len += sprintf(page + len, "none "); xa_lock(&md_submodule); xa_for_each(&md_submodule, i, head) { if (head->type != MD_BITMAP) continue; if (mddev->bitmap_id == head->id) len += sprintf(page + len, "[%s] ", head->name); else len += sprintf(page + len, "%s ", head->name); } xa_unlock(&md_submodule); len += sprintf(page + len, "\n"); return len; } static ssize_t bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) { struct md_submodule_head *head; enum md_submodule_id id; unsigned long i; int err = 0; xa_lock(&md_submodule); if (mddev->bitmap_ops) { err = -EBUSY; goto out; } if (cmd_match(buf, "none")) { mddev->bitmap_id = ID_BITMAP_NONE; goto out; } xa_for_each(&md_submodule, i, head) { if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { mddev->bitmap_id = head->id; goto out; } } err = kstrtoint(buf, 10, &id); if (err) goto out; if (id == ID_BITMAP_NONE) { mddev->bitmap_id = id; goto out; } head = xa_load(&md_submodule, id); if (head && head->type == MD_BITMAP) { mddev->bitmap_id = id; goto out; } err = -ENOENT; out: xa_unlock(&md_submodule); return err ? err : len; } static struct md_sysfs_entry md_bitmap_type = __ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); static ssize_t layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ if (mddev->reshape_position != MaxSector && mddev->layout != mddev->new_layout) return sprintf(page, "%d (%d)\n", mddev->new_layout, mddev->layout); return sprintf(page, "%d\n", mddev->layout); } static ssize_t layout_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int n; int err; err = kstrtouint(buf, 10, &n); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) { if (mddev->pers->check_reshape == NULL) err = -EBUSY; else if (!md_is_rdwr(mddev)) err = -EROFS; else { mddev->new_layout = n; err = mddev->pers->check_reshape(mddev); if (err) mddev->new_layout = mddev->layout; } } else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) mddev->layout = n; } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_layout = __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); static ssize_t raid_disks_show(struct mddev *mddev, char *page) { if (mddev->raid_disks == 0) return 0; if (mddev->reshape_position != MaxSector && mddev->delta_disks != 0) return sprintf(page, "%d (%d)\n", mddev->raid_disks, mddev->raid_disks - mddev->delta_disks); return sprintf(page, "%d\n", mddev->raid_disks); } static int update_raid_disks(struct mddev *mddev, int raid_disks); static ssize_t raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int n; int err; err = kstrtouint(buf, 10, &n); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) err = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; err = -EINVAL; rdev_for_each(rdev, mddev) { if (olddisks < n && rdev->data_offset < rdev->new_data_offset) goto out_unlock; if (olddisks > n && rdev->data_offset > rdev->new_data_offset) goto out_unlock; } err = 0; mddev->delta_disks = n - olddisks; mddev->raid_disks = n; mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; out_unlock: mddev_unlock(mddev); return err ? err : len; } static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t uuid_show(struct mddev *mddev, char *page) { return sprintf(page, "%pU\n", mddev->uuid); } static struct md_sysfs_entry md_uuid = __ATTR(uuid, S_IRUGO, uuid_show, NULL); static ssize_t chunk_size_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector && mddev->chunk_sectors != mddev->new_chunk_sectors) return sprintf(page, "%d (%d)\n", mddev->new_chunk_sectors << 9, mddev->chunk_sectors << 9); return sprintf(page, "%d\n", mddev->chunk_sectors << 9); } static ssize_t chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long n; int err; err = kstrtoul(buf, 10, &n); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) { if (mddev->pers->check_reshape == NULL) err = -EBUSY; else if (!md_is_rdwr(mddev)) err = -EROFS; else { mddev->new_chunk_sectors = n >> 9; err = mddev->pers->check_reshape(mddev); if (err) mddev->new_chunk_sectors = mddev->chunk_sectors; } } else { mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) mddev->chunk_sectors = n >> 9; } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(struct mddev *mddev, char *page) { if (mddev->resync_offset == MaxSector) return sprintf(page, "none\n"); return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); } static ssize_t resync_start_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long n; int err; if (cmd_match(buf, "none")) n = MaxSector; else { err = kstrtoull(buf, 10, &n); if (err < 0) return err; if (n != (sector_t)n) return -EINVAL; } err = mddev_lock(mddev); if (err) return err; if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) err = -EBUSY; if (!err) { mddev->resync_offset = n; if (mddev->pers) set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_resync_start = __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); /* * The array state can be: * * clear * No devices, no size, no level * Equivalent to STOP_ARRAY ioctl * inactive * May have some settings, but array is not active * all IO results in error * When written, doesn't tear down array, but just stops it * suspended (not supported yet) * All IO requests will block. The array can be reconfigured. * Writing this, if accepted, will block until array is quiescent * readonly * no resync can happen. no superblocks get written. * write requests fail * read-auto * like readonly, but behaves like 'clean' on a write request. * * clean - no pending writes, but otherwise active. * When written to inactive array, starts without resync * If a write request arrives then * if metadata is known, mark 'dirty' and switch to 'active'. * if not known, block and switch to write-pending * If written to an active array that has pending writes, then fails. * active * fully active: IO and resync can be happening. * When written to inactive array, starts with resync * * write-pending * clean, but writes are blocked waiting for 'active' to be written. * * active-idle * like active, but no writes have been seen for a while (100msec). * * broken * Array is failed. It's useful because mounted-arrays aren't stopped * when array is failed, so this state will at least alert the user that * something is wrong. */ enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, write_pending, active_idle, broken, bad_word}; static char *array_states[] = { "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", "write-pending", "active-idle", "broken", NULL }; static int match_word(const char *word, char **list) { int n; for (n=0; list[n]; n++) if (cmd_match(word, list[n])) break; return n; } static ssize_t array_state_show(struct mddev *mddev, char *page) { enum array_state st = inactive; if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { switch(mddev->ro) { case MD_RDONLY: st = readonly; break; case MD_AUTO_READ: st = read_auto; break; case MD_RDWR: spin_lock(&mddev->lock); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) st = write_pending; else if (mddev->in_sync) st = clean; else if (mddev->safemode) st = active_idle; else st = active; spin_unlock(&mddev->lock); } if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) st = broken; } else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && mddev->dev_sectors == 0) st = clear; else st = inactive; } return sprintf(page, "%s\n", array_states[st]); } static int do_md_stop(struct mddev *mddev, int ro); static int md_set_readonly(struct mddev *mddev); static int restart_array(struct mddev *mddev); static ssize_t array_state_store(struct mddev *mddev, const char *buf, size_t len) { int err = 0; enum array_state st = match_word(buf, array_states); /* No lock dependent actions */ switch (st) { case suspended: /* not supported yet */ case write_pending: /* cannot be set */ case active_idle: /* cannot be set */ case broken: /* cannot be set */ case bad_word: return -EINVAL; case clear: case readonly: case inactive: case read_auto: if (!mddev->pers || !md_is_rdwr(mddev)) break; /* write sysfs will not open mddev and opener should be 0 */ err = mddev_set_closing_and_sync_blockdev(mddev, 0); if (err) return err; break; default: break; } if (mddev->pers && (st == active || st == clean) && mddev->ro != MD_RDONLY) { /* don't take reconfig_mutex when toggling between * clean and active */ spin_lock(&mddev->lock); if (st == active) { restart_array(mddev); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); } else /* st == clean */ { restart_array(mddev); if (!set_in_sync(mddev)) err = -EBUSY; } if (!err) sysfs_notify_dirent_safe(mddev->sysfs_state); spin_unlock(&mddev->lock); return err ?: len; } err = mddev_lock(mddev); if (err) return err; switch (st) { case inactive: /* stop an active array, return 0 otherwise */ if (mddev->pers) err = do_md_stop(mddev, 2); break; case clear: err = do_md_stop(mddev, 0); break; case readonly: if (mddev->pers) err = md_set_readonly(mddev); else { mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); err = do_md_run(mddev); } break; case read_auto: if (mddev->pers) { if (md_is_rdwr(mddev)) err = md_set_readonly(mddev); else if (mddev->ro == MD_RDONLY) err = restart_array(mddev); if (err == 0) { mddev->ro = MD_AUTO_READ; set_disk_ro(mddev->gendisk, 0); } } else { mddev->ro = MD_AUTO_READ; err = do_md_run(mddev); } break; case clean: if (mddev->pers) { err = restart_array(mddev); if (err) break; spin_lock(&mddev->lock); if (!set_in_sync(mddev)) err = -EBUSY; spin_unlock(&mddev->lock); } else err = -EINVAL; break; case active: if (mddev->pers) { err = restart_array(mddev); if (err) break; clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); wake_up(&mddev->sb_wait); err = 0; } else { mddev->ro = MD_RDWR; set_disk_ro(mddev->gendisk, 0); err = do_md_run(mddev); } break; default: err = -EINVAL; break; } if (!err) { if (mddev->hold_active == UNTIL_IOCTL) mddev->hold_active = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); } mddev_unlock(mddev); if (st == readonly || st == read_auto || st == inactive || (err && st == clear)) clear_bit(MD_CLOSING, &mddev->flags); return err ?: len; } static struct md_sysfs_entry md_array_state = __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t max_corrected_read_errors_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", atomic_read(&mddev->max_corr_read_errors)); } static ssize_t max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int n; int rv; rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; if (n > INT_MAX) return -EINVAL; atomic_set(&mddev->max_corr_read_errors, n); return len; } static struct md_sysfs_entry max_corr_read_errors = __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, max_corrected_read_errors_store); static ssize_t null_show(struct mddev *mddev, char *page) { return -EINVAL; } static ssize_t new_dev_store(struct mddev *mddev, const char *buf, size_t len) { /* buf must be %d:%d\n? giving major and minor numbers */ /* The new device is added to the array. * If the array has a persistent superblock, we read the * superblock to initialise info and check validity. * Otherwise, only checking done is that in bind_rdev_to_array, * which mainly checks size. */ char *e; int major = simple_strtoul(buf, &e, 10); int minor; dev_t dev; struct md_rdev *rdev; int err; if (!*buf || *e != ':' || !e[1] || e[1] == '\n') return -EINVAL; minor = simple_strtoul(e+1, &e, 10); if (*e && *e != '\n') return -EINVAL; dev = MKDEV(major, minor); if (major != MAJOR(dev) || minor != MINOR(dev)) return -EOVERFLOW; err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->persistent) { rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { struct md_rdev *rdev0 = list_entry(mddev->disks.next, struct md_rdev, same_set); err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) goto out; } } else if (mddev->external) rdev = md_import_device(dev, -2, -1); else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { mddev_unlock_and_resume(mddev); return PTR_ERR(rdev); } err = bind_rdev_to_array(rdev, mddev); out: if (err) export_rdev(rdev, mddev); mddev_unlock_and_resume(mddev); if (!err) md_new_event(); return err ? err : len; } static struct md_sysfs_entry md_new_device = __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); static ssize_t bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; int err; if (!md_bitmap_enabled(mddev, false)) return len; err = mddev_lock(mddev); if (err) return err; if (!mddev->bitmap) goto out; /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ while (*buf) { chunk = end_chunk = simple_strtoul(buf, &end, 0); if (buf == end) break; if (*end == '-') { /* range */ buf = end + 1; end_chunk = simple_strtoul(buf, &end, 0); if (buf == end) break; } if (*end && !isspace(*end)) break; mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); buf = skip_spaces(end); } mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ out: mddev_unlock(mddev); return len; } static struct md_sysfs_entry md_bitmap = __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t size_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->dev_sectors / 2); } static int update_size(struct mddev *mddev, sector_t num_sectors); static ssize_t size_store(struct mddev *mddev, const char *buf, size_t len) { /* If array is inactive, we can reduce the component size, but * not increase it (except from 0). * If array is active, we can try an on-line resize */ sector_t sectors; int err = strict_blocks_to_sectors(buf, &sectors); if (err < 0) return err; err = mddev_lock(mddev); if (err) return err; if (mddev->pers) { err = update_size(mddev, sectors); if (err == 0) md_update_sb(mddev, 1); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) mddev->dev_sectors = sectors; else err = -ENOSPC; } mddev_unlock(mddev); return err ? err : len; } static struct md_sysfs_entry md_size = __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); /* Metadata version. * This is one of * 'none' for arrays with no metadata (good luck...) * 'external' for arrays with externally managed metadata, * or N.M for internally known formats */ static ssize_t metadata_show(struct mddev *mddev, char *page) { if (mddev->persistent) return sprintf(page, "%d.%d\n", mddev->major_version, mddev->minor_version); else if (mddev->external) return sprintf(page, "external:%s\n", mddev->metadata_type); else return sprintf(page, "none\n"); } static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; int err; /* Changing the details of 'external' metadata is * always permitted. Otherwise there must be * no devices attached to the array. */ err = mddev_lock(mddev); if (err) return err; err = -EBUSY; if (mddev->external && strncmp(buf, "external:", 9) == 0) ; else if (!list_empty(&mddev->disks)) goto out_unlock; err = 0; if (cmd_match(buf, "none")) { mddev->persistent = 0; mddev->external = 0; mddev->major_version = 0; mddev->minor_version = 90; goto out_unlock; } if (strncmp(buf, "external:", 9) == 0) { size_t namelen = len-9; if (namelen >= sizeof(mddev->metadata_type)) namelen = sizeof(mddev->metadata_type)-1; memcpy(mddev->metadata_type, buf+9, namelen); mddev->metadata_type[namelen] = 0; if (namelen && mddev->metadata_type[namelen-1] == '\n') mddev->metadata_type[--namelen] = 0; mddev->persistent = 0; mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; goto out_unlock; } major = simple_strtoul(buf, &e, 10); err = -EINVAL; if (e==buf || *e != '.') goto out_unlock; buf = e+1; minor = simple_strtoul(buf, &e, 10); if (e==buf || (*e && *e != '\n') ) goto out_unlock; err = -ENOENT; if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) goto out_unlock; mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; mddev->external = 0; err = 0; out_unlock: mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_metadata = __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors) { return rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < sectors; } static enum sync_action md_get_active_sync_action(struct mddev *mddev) { struct md_rdev *rdev; bool is_recover = false; if (mddev->resync_offset < MaxSector) return ACTION_RESYNC; if (mddev->reshape_position != MaxSector) return ACTION_RESHAPE; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { if (rdev_needs_recovery(rdev, MaxSector)) { is_recover = true; break; } } rcu_read_unlock(); return is_recover ? ACTION_RECOVER : ACTION_IDLE; } enum sync_action md_sync_action(struct mddev *mddev) { unsigned long recovery = mddev->recovery; enum sync_action active_action; /* * frozen has the highest priority, means running sync_thread will be * stopped immediately, and no new sync_thread can start. */ if (test_bit(MD_RECOVERY_FROZEN, &recovery)) return ACTION_FROZEN; /* * read-only array can't register sync_thread, and it can only * add/remove spares. */ if (!md_is_rdwr(mddev)) return ACTION_IDLE; /* * idle means no sync_thread is running, and no new sync_thread is * requested. */ if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && !test_bit(MD_RECOVERY_NEEDED, &recovery)) return ACTION_IDLE; /* * Check if any sync operation (resync/recover/reshape) is * currently active. This ensures that only one sync operation * can run at a time. Returns the type of active operation, or * ACTION_IDLE if none are active. */ active_action = md_get_active_sync_action(mddev); if (active_action != ACTION_IDLE) return active_action; if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return ACTION_RESHAPE; if (test_bit(MD_RECOVERY_RECOVER, &recovery)) return ACTION_RECOVER; if (test_bit(MD_RECOVERY_SYNC, &recovery)) { /* * MD_RECOVERY_CHECK must be paired with * MD_RECOVERY_REQUESTED. */ if (test_bit(MD_RECOVERY_CHECK, &recovery)) return ACTION_CHECK; if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) return ACTION_REPAIR; return ACTION_RESYNC; } /* * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no * sync_action is specified. */ return ACTION_IDLE; } enum sync_action md_sync_action_by_name(const char *page) { enum sync_action action; for (action = 0; action < NR_SYNC_ACTIONS; ++action) { if (cmd_match(page, action_name[action])) return action; } return NR_SYNC_ACTIONS; } const char *md_sync_action_name(enum sync_action action) { return action_name[action]; } static ssize_t action_show(struct mddev *mddev, char *page) { enum sync_action action = md_sync_action(mddev); return sprintf(page, "%s\n", md_sync_action_name(action)); } /** * stop_sync_thread() - wait for sync_thread to stop if it's running. * @mddev: the array. * @locked: if set, reconfig_mutex will still be held after this function * return; if not set, reconfig_mutex will be released after this * function return. */ static void stop_sync_thread(struct mddev *mddev, bool locked) { int sync_seq = atomic_read(&mddev->sync_seq); if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { if (!locked) mddev_unlock(mddev); return; } mddev_unlock(mddev); set_bit(MD_RECOVERY_INTR, &mddev->recovery); /* * Thread might be blocked waiting for metadata update which will now * never happen */ md_wakeup_thread_directly(mddev->sync_thread); if (work_pending(&mddev->sync_work)) flush_work(&mddev->sync_work); wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && sync_seq != atomic_read(&mddev->sync_seq))); if (locked) mddev_lock_nointr(mddev); } void md_idle_sync_thread(struct mddev *mddev) { lockdep_assert_held(&mddev->reconfig_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev, true); } EXPORT_SYMBOL_GPL(md_idle_sync_thread); void md_frozen_sync_thread(struct mddev *mddev) { lockdep_assert_held(&mddev->reconfig_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); stop_sync_thread(mddev, true); } EXPORT_SYMBOL_GPL(md_frozen_sync_thread); void md_unfrozen_sync_thread(struct mddev *mddev) { lockdep_assert_held(&mddev->reconfig_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); } EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); static int mddev_start_reshape(struct mddev *mddev) { int ret; if (mddev->pers->start_reshape == NULL) return -EINVAL; if (mddev->reshape_position == MaxSector || mddev->pers->check_reshape == NULL || mddev->pers->check_reshape(mddev)) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); ret = mddev->pers->start_reshape(mddev); if (ret) return ret; } else { /* * If reshape is still in progress, and md_check_recovery() can * continue to reshape, don't restart reshape because data can * be corrupted for raid456. */ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } sysfs_notify_dirent_safe(mddev->sysfs_degraded); return 0; } static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { int ret; enum sync_action action; if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; retry: if (work_busy(&mddev->sync_work)) flush_work(&mddev->sync_work); ret = mddev_lock(mddev); if (ret) return ret; if (work_busy(&mddev->sync_work)) { mddev_unlock(mddev); goto retry; } action = md_sync_action_by_name(page); /* TODO: mdadm rely on "idle" to start sync_thread. */ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { switch (action) { case ACTION_FROZEN: md_frozen_sync_thread(mddev); ret = len; goto out; case ACTION_IDLE: md_idle_sync_thread(mddev); break; case ACTION_RESHAPE: case ACTION_RECOVER: case ACTION_CHECK: case ACTION_REPAIR: case ACTION_RESYNC: ret = -EBUSY; goto out; default: ret = -EINVAL; goto out; } } else { switch (action) { case ACTION_FROZEN: set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); ret = len; goto out; case ACTION_RESHAPE: clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); ret = mddev_start_reshape(mddev); if (ret) goto out; break; case ACTION_RECOVER: clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); break; case ACTION_CHECK: set_bit(MD_RECOVERY_CHECK, &mddev->recovery); fallthrough; case ACTION_REPAIR: set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); fallthrough; case ACTION_RESYNC: case ACTION_IDLE: clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); break; default: ret = -EINVAL; goto out; } } if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); ret = len; out: mddev_unlock(mddev); return ret; } static struct md_sysfs_entry md_scan_mode = __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static ssize_t last_sync_action_show(struct mddev *mddev, char *page) { return sprintf(page, "%s\n", md_sync_action_name(mddev->last_sync_action)); } static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long) atomic64_read(&mddev->resync_mismatches)); } static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); static ssize_t sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), mddev->sync_speed_min ? "local" : "system"); } static ssize_t sync_min_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int min; int rv; if (strncmp(buf, "system", 6) == 0) { min = 0; } else { rv = kstrtouint(buf, 10, &min); if (rv < 0) return rv; if (min == 0) return -EINVAL; } mddev->sync_speed_min = min; return len; } static struct md_sysfs_entry md_sync_min = __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); static ssize_t sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), mddev->sync_speed_max ? "local" : "system"); } static ssize_t sync_max_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int max; int rv; if (strncmp(buf, "system", 6) == 0) { max = 0; } else { rv = kstrtouint(buf, 10, &max); if (rv < 0) return rv; if (max == 0) return -EINVAL; } mddev->sync_speed_max = max; return len; } static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t sync_io_depth_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), mddev->sync_io_depth ? "local" : "system"); } static ssize_t sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) { unsigned int max; int rv; if (strncmp(buf, "system", 6) == 0) { max = 0; } else { rv = kstrtouint(buf, 10, &max); if (rv < 0) return rv; if (max == 0) return -EINVAL; } mddev->sync_io_depth = max; return len; } static struct md_sysfs_entry md_sync_io_depth = __ATTR_RW(sync_io_depth); static ssize_t degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); } static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); static ssize_t sync_force_parallel_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->parallel_resync); } static ssize_t sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) { long n; if (kstrtol(buf, 10, &n)) return -EINVAL; if (n != 0 && n != 1) return -EINVAL; mddev->parallel_resync = n; if (mddev->sync_thread) wake_up(&resync_wait); return len; } /* force parallel resync, even with shared block devices */ static struct md_sysfs_entry md_sync_force_parallel = __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, sync_force_parallel_show, sync_force_parallel_store); static ssize_t sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; if (mddev->curr_resync == MD_RESYNC_NONE) return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; if (!dt) dt++; db = resync - mddev->resync_mark_cnt; return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ } static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t sync_completed_show(struct mddev *mddev, char *page) { unsigned long long max_sectors, resync; if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); if (mddev->curr_resync == MD_RESYNC_YIELDED || mddev->curr_resync == MD_RESYNC_DELAYED) return sprintf(page, "delayed\n"); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; resync = mddev->curr_resync_completed; return sprintf(page, "%llu / %llu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); static ssize_t min_sync_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_min); } static ssize_t min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; int err; if (kstrtoull(buf, 10, &min)) return -EINVAL; spin_lock(&mddev->lock); err = -EINVAL; if (min > mddev->resync_max) goto out_unlock; err = -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) goto out_unlock; /* Round down to multiple of 4K for safety */ mddev->resync_min = round_down(min, 8); err = 0; out_unlock: spin_unlock(&mddev->lock); return err ?: len; } static struct md_sysfs_entry md_min_sync = __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); static ssize_t max_sync_show(struct mddev *mddev, char *page) { if (mddev->resync_max == MaxSector) return sprintf(page, "max\n"); else return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_max); } static ssize_t max_sync_store(struct mddev *mddev, const char *buf, size_t len) { int err; spin_lock(&mddev->lock); if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; int chunk; err = -EINVAL; if (kstrtoull(buf, 10, &max)) goto out_unlock; if (max < mddev->resync_min) goto out_unlock; err = -EBUSY; if (max < mddev->resync_max && md_is_rdwr(mddev) && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) goto out_unlock; /* Must be a multiple of chunk_size */ chunk = mddev->chunk_sectors; if (chunk) { sector_t temp = max; err = -EINVAL; if (sector_div(temp, chunk)) goto out_unlock; } mddev->resync_max = max; } wake_up(&mddev->recovery_wait); err = 0; out_unlock: spin_unlock(&mddev->lock); return err ?: len; } static struct md_sysfs_entry md_max_sync = __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); static ssize_t suspend_lo_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)READ_ONCE(mddev->suspend_lo)); } static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long new; int err; err = kstrtoull(buf, 10, &new); if (err < 0) return err; if (new != (sector_t)new) return -EINVAL; err = mddev_suspend(mddev, true); if (err) return err; WRITE_ONCE(mddev->suspend_lo, new); mddev_resume(mddev); return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); static ssize_t suspend_hi_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)READ_ONCE(mddev->suspend_hi)); } static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long new; int err; err = kstrtoull(buf, 10, &new); if (err < 0) return err; if (new != (sector_t)new) return -EINVAL; err = mddev_suspend(mddev, true); if (err) return err; WRITE_ONCE(mddev->suspend_hi, new); mddev_resume(mddev); return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); static ssize_t reshape_position_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector) return sprintf(page, "%llu\n", (unsigned long long)mddev->reshape_position); strcpy(page, "none\n"); return 5; } static ssize_t reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { struct md_rdev *rdev; unsigned long long new; int err; err = kstrtoull(buf, 10, &new); if (err < 0) return err; if (new != (sector_t)new) return -EINVAL; err = mddev_lock(mddev); if (err) return err; err = -EBUSY; if (mddev->pers) goto unlock; mddev->reshape_position = new; mddev->delta_disks = 0; mddev->reshape_backwards = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; err = 0; unlock: mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_reshape_position = __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, reshape_position_store); static ssize_t reshape_direction_show(struct mddev *mddev, char *page) { return sprintf(page, "%s\n", mddev->reshape_backwards ? "backwards" : "forwards"); } static ssize_t reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) { int backwards = 0; int err; if (cmd_match(buf, "forwards")) backwards = 0; else if (cmd_match(buf, "backwards")) backwards = 1; else return -EINVAL; if (mddev->reshape_backwards == backwards) return len; err = mddev_lock(mddev); if (err) return err; /* check if we are allowed to change */ if (mddev->delta_disks) err = -EBUSY; else if (mddev->persistent && mddev->major_version == 0) err = -EINVAL; else mddev->reshape_backwards = backwards; mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_reshape_direction = __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, reshape_direction_store); static ssize_t array_size_show(struct mddev *mddev, char *page) { if (mddev->external_size) return sprintf(page, "%llu\n", (unsigned long long)mddev->array_sectors/2); else return sprintf(page, "default\n"); } static ssize_t array_size_store(struct mddev *mddev, const char *buf, size_t len) { sector_t sectors; int err; err = mddev_lock(mddev); if (err) return err; /* cluster raid doesn't support change array_sectors */ if (mddev_is_clustered(mddev)) { mddev_unlock(mddev); return -EINVAL; } if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) sectors = mddev->pers->size(mddev, 0, 0); else sectors = mddev->array_sectors; mddev->external_size = 0; } else { if (strict_blocks_to_sectors(buf, &sectors) < 0) err = -EINVAL; else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) err = -E2BIG; else mddev->external_size = 1; } if (!err) { mddev->array_sectors = sectors; if (mddev->pers) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); } mddev_unlock(mddev); return err ?: len; } static struct md_sysfs_entry md_array_size = __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, array_size_store); static ssize_t consistency_policy_show(struct mddev *mddev, char *page) { int ret; if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { ret = sprintf(page, "journal\n"); } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { ret = sprintf(page, "ppl\n"); } else if (mddev->bitmap) { ret = sprintf(page, "bitmap\n"); } else if (mddev->pers) { if (mddev->pers->sync_request) ret = sprintf(page, "resync\n"); else ret = sprintf(page, "none\n"); } else { ret = sprintf(page, "unknown\n"); } return ret; } static ssize_t consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) { int err = 0; if (mddev->pers) { if (mddev->pers->change_consistency_policy) err = mddev->pers->change_consistency_policy(mddev, buf); else err = -EBUSY; } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { set_bit(MD_HAS_PPL, &mddev->flags); } else { err = -EINVAL; } return err ? err : len; } static struct md_sysfs_entry md_consistency_policy = __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, consistency_policy_store); static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->fail_last_dev); } /* * Setting fail_last_dev to true to allow last device to be forcibly removed * from RAID1/RAID10. */ static ssize_t fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) { int ret; bool value; ret = kstrtobool(buf, &value); if (ret) return ret; if (value != mddev->fail_last_dev) mddev->fail_last_dev = value; return len; } static struct md_sysfs_entry md_fail_last_dev = __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, fail_last_dev_store); static ssize_t serialize_policy_show(struct mddev *mddev, char *page) { if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) return sprintf(page, "n/a\n"); else return sprintf(page, "%d\n", mddev->serialize_policy); } /* * Setting serialize_policy to true to enforce write IO is not reordered * for raid1. */ static ssize_t serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) { int err; bool value; err = kstrtobool(buf, &value); if (err) return err; if (value == mddev->serialize_policy) return len; err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { pr_err("md: serialize_policy is only effective for raid1\n"); err = -EINVAL; goto unlock; } if (value) mddev_create_serial_pool(mddev, NULL); else mddev_destroy_serial_pool(mddev, NULL); mddev->serialize_policy = value; unlock: mddev_unlock_and_resume(mddev); return err ?: len; } static struct md_sysfs_entry md_serialize_policy = __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, serialize_policy_store); static struct attribute *md_default_attrs[] = { &md_level.attr, &md_new_level.attr, &md_bitmap_type.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, &md_chunk_size.attr, &md_size.attr, &md_resync_start.attr, &md_metadata.attr, &md_new_device.attr, &md_safe_delay.attr, &md_array_state.attr, &md_reshape_position.attr, &md_reshape_direction.attr, &md_array_size.attr, &max_corr_read_errors.attr, &md_consistency_policy.attr, &md_fail_last_dev.attr, &md_serialize_policy.attr, NULL, }; static const struct attribute_group md_default_group = { .attrs = md_default_attrs, }; static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, &md_last_scan_mode.attr, &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, &md_sync_io_depth.attr, &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, &md_min_sync.attr, &md_max_sync.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, &md_bitmap.attr, &md_degraded.attr, NULL, }; static const struct attribute_group md_redundancy_group = { .name = NULL, .attrs = md_redundancy_attrs, }; static const struct attribute_group *md_attr_groups[] = { &md_default_group, NULL, }; static ssize_t md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->show) return -EIO; spin_lock(&all_mddevs_lock); if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } spin_unlock(&all_mddevs_lock); rv = entry->show(mddev, page); mddev_put(mddev); return rv; } static ssize_t md_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; struct kernfs_node *kn = NULL; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (entry->store == array_state_store && cmd_match(page, "clear")) kn = sysfs_break_active_protection(kobj, attr); spin_lock(&all_mddevs_lock); if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); if (kn) sysfs_unbreak_active_protection(kn); return -EBUSY; } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); mddev_put(mddev); if (kn) sysfs_unbreak_active_protection(kn); return rv; } static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); if (legacy_async_del_gendisk) { if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); del_gendisk(mddev->gendisk); } put_disk(mddev->gendisk); } static const struct sysfs_ops md_sysfs_ops = { .show = md_attr_show, .store = md_attr_store, }; static const struct kobj_type md_ktype = { .release = md_kobj_release, .sysfs_ops = &md_sysfs_ops, .default_groups = md_attr_groups, }; int mdp_major = 0; /* stack the limit for all rdevs into lim */ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, unsigned int flags) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) { queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); if ((flags & MDDEV_STACK_INTEGRITY) && !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) return -EINVAL; } return 0; } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); /* apply the extra stacking limits from a new rdev into mddev */ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) { struct queue_limits lim; if (mddev_is_dm(mddev)) return 0; lim = queue_limits_start_update(mddev->gendisk->queue); queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { pr_err("%s: incompatible integrity profile for %pg\n", mdname(mddev), rdev->bdev); queue_limits_cancel_update(mddev->gendisk->queue); return -ENXIO; } return queue_limits_commit_update(mddev->gendisk->queue, &lim); } EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); /* update the optimal I/O size after a reshape */ void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) { struct queue_limits lim; if (mddev_is_dm(mddev)) return; /* don't bother updating io_opt if we can't suspend the array */ if (mddev_suspend(mddev, false) < 0) return; lim = queue_limits_start_update(mddev->gendisk->queue); lim.io_opt = lim.io_min * nr_stripes; queue_limits_commit_update(mddev->gendisk->queue, &lim); mddev_resume(mddev); } EXPORT_SYMBOL_GPL(mddev_update_io_opt); static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); kobject_put(&mddev->kobj); } void md_init_stacking_limits(struct queue_limits *lim) { blk_set_stacking_limits(lim); lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; } EXPORT_SYMBOL_GPL(md_init_stacking_limits); struct mddev *md_alloc(dev_t dev, char *name) { /* * If dev is zero, name is the name of a device to allocate with * an arbitrary minor number. It will be "md_???" * If dev is non-zero it must be a device number with a MAJOR of * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then * the device is being created by opening a node in /dev. * If "name" is not NULL, the device is being created by * writing to /sys/module/md_mod/parameters/new_array. */ static DEFINE_MUTEX(disks_mutex); struct mddev *mddev; struct gendisk *disk; int partitioned; int shift; int unit; int error; /* * Wait for any previous instance of this device to be completely * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); if (IS_ERR(mddev)) { error = PTR_ERR(mddev); goto out_unlock; } partitioned = (MAJOR(mddev->unit) != MD_MAJOR); shift = partitioned ? MdpMinorShift : 0; unit = MINOR(mddev->unit) >> shift; if (name && !dev) { /* Need to ensure that 'name' is not a duplicate. */ struct mddev *mddev2; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev2, &all_mddevs, all_mddevs) if (mddev2->gendisk && strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); error = -EEXIST; goto out_free_mddev; } spin_unlock(&all_mddevs_lock); } if (name && dev) /* * Creating /dev/mdNNN via "newarray", so adjust hold_active. */ mddev->hold_active = UNTIL_STOP; disk = blk_alloc_disk(NULL, NUMA_NO_NODE); if (IS_ERR(disk)) { error = PTR_ERR(disk); goto out_free_mddev; } disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; disk->minors = 1 << shift; if (name) strcpy(disk->disk_name, name); else if (partitioned) sprintf(disk->disk_name, "md_d%d", unit); else sprintf(disk->disk_name, "md%d", unit); disk->fops = &md_fops; disk->private_data = mddev; disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); if (error) goto out_put_disk; kobject_init(&mddev->kobj, &md_ktype); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); if (error) { /* * The disk is already live at this point. Clear the hold flag * and let mddev_put take care of the deletion, as it isn't any * different from a normal close on last release now. */ mddev->hold_active = 0; mutex_unlock(&disks_mutex); mddev_put(mddev); return ERR_PTR(error); } kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); mutex_unlock(&disks_mutex); return mddev; out_put_disk: put_disk(disk); out_free_mddev: mddev_free(mddev); out_unlock: mutex_unlock(&disks_mutex); return ERR_PTR(error); } static int md_alloc_and_put(dev_t dev, char *name) { struct mddev *mddev = md_alloc(dev, name); if (legacy_async_del_gendisk) pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n"); if (IS_ERR(mddev)) return PTR_ERR(mddev); mddev_put(mddev); return 0; } static void md_probe(dev_t dev) { if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) return; if (create_on_open) md_alloc_and_put(dev, NULL); } static int add_named_array(const char *val, const struct kernel_param *kp) { /* * val must be "md_*" or "mdNNN". * For "md_*" we allocate an array with a large free minor number, and * set the name to val. val must not already be an active name. * For "mdNNN" we a