Total coverage: 223980 (13%)of 1864406
124 124 119 99 99 124 123 124 124 124 124 5 5 116 426 461 462 139 398 27 379 398 19 379 398 125 125 1 9 117 199 195 2 196 196 198 148 266 266 306 307 3 3 2 197 198 195 3 36 79 76 107 2 1 3 125 359 360 360 358 341 19 358 360 360 359 791 669 1 92 29 93 283 69 210 21 8 230 9 7 1 3 28 28 1 2 1 2 4 10 10 17 1 4 18 2 22 20 19 1 15 2 2 3 15 1 509 509 3 1 2 3 1 2 3 8 228 228 210 198 13 209 206 197 14 3 1 2 2 213 202 8 5 2 1 2 2 3 4 5 1 1 2 1 2 2 11 7 5 9 2 10 7 5 8 7 1 11 2 135 5 3 132 110 54 135 159 16 8 137 167 171 141 130 7 5 155 155 128 314 315 1 1 309 1 3 2 2 279 8 28 308 2 2 303 303 28 273 303 225 37 3 1 1 1 1 1 2 153 153 20 139 116 54 164 11 1 163 159 4 162 116 195 46 148 2 193 32 160 1047 1048 5 1033 108 64 67 107 990 5 5 79 1047 1046 5 7 7 2 2 2 380 380 7 1 381 378 380 377 4 4 13 4 377 2 380 249 231 381 14 139 139 139 139 18 13 326 326 326 4 324 3 41 3 3 2 2 186 186 39 39 901 901 330 1 330 328 326 330 325 323 743 33 517 601 734 250 251 248 4 4 250 170 29 144 139 5 124 136 170 170 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: semantics. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/netlink.h> #include <linux/hash.h> #include <linux/nospec.h> #include <net/arp.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/nexthop.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/addrconf.h> #include "fib_lookup.h" /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ #ifdef CONFIG_IP_ROUTE_MULTIPATH #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ for (nhsel = 0, nh = (fi)->fib_nh; \ nhsel < fib_info_num_path((fi)); \ nh++, nhsel++) #define change_nexthops(fi) { \ int nhsel; struct fib_nh *nexthop_nh; \ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ nhsel < fib_info_num_path((fi)); \ nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ /* Hope, that gcc will optimize it to get rid of dummy loop */ #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \ for (nhsel = 0; nhsel < 1; nhsel++) #define change_nexthops(fi) { \ int nhsel; \ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ for (nhsel = 0; nhsel < 1; nhsel++) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ #define endfor_nexthops(fi) } const struct fib_prop fib_props[RTN_MAX + 1] = { [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE, }, [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, }, [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST, }, [RTN_BROADCAST] = { .error = 0, .scope = RT_SCOPE_LINK, }, [RTN_ANYCAST] = { .error = 0, .scope = RT_SCOPE_LINK, }, [RTN_MULTICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, }, [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE, }, [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE, }, [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE, }, [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE, }, [RTN_NAT] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, }, [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, }, }; static void rt_fibinfo_free(struct rtable __rcu **rtp) { struct rtable *rt = rcu_dereference_protected(*rtp, 1); if (!rt) return; /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); * because we waited an RCU grace period before calling * free_fib_info_rcu() */ dst_dev_put(&rt->dst); dst_release_immediate(&rt->dst); } static void free_nh_exceptions(struct fib_nh_common *nhc) { struct fnhe_hash_bucket *hash; int i; hash = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!hash) return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; fnhe = rcu_dereference_protected(hash[i].chain, 1); while (fnhe) { struct fib_nh_exception *next; next = rcu_dereference_protected(fnhe->fnhe_next, 1); rt_fibinfo_free(&fnhe->fnhe_rth_input); rt_fibinfo_free(&fnhe->fnhe_rth_output); kfree(fnhe); fnhe = next; } } kfree(hash); } static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) { int cpu; if (!rtp) return; for_each_possible_cpu(cpu) { struct rtable *rt; rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); if (rt) { dst_dev_put(&rt->dst); dst_release_immediate(&rt->dst); } } free_percpu(rtp); } void fib_nh_common_release(struct fib_nh_common *nhc) { netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker); lwtstate_put(nhc->nhc_lwtstate); rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); rt_fibinfo_free(&nhc->nhc_rth_input); free_nh_exceptions(nhc); } EXPORT_SYMBOL_GPL(fib_nh_common_release); void fib_nh_release(struct net *net, struct fib_nh *fib_nh) { #ifdef CONFIG_IP_ROUTE_CLASSID if (fib_nh->nh_tclassid) atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif fib_nh_common_release(&fib_nh->nh_common); } /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); if (fi->nh) { nexthop_put(fi->nh); } else { change_nexthops(fi) { fib_nh_release(fi->fib_net, nexthop_nh); } endfor_nexthops(fi); } ip_fib_metrics_put(fi->fib_metrics); kfree(fi); } void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { pr_warn("Freeing alive fib_info %p\n", fi); return; } call_rcu_hurry(&fi->rcu, free_fib_info_rcu); } EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); fi->fib_net->ipv4.fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); if (fi->nh) { list_del(&fi->nh_list); } else { change_nexthops(fi) { if (!nexthop_nh->fib_nh_dev) continue; hlist_del_rcu(&nexthop_nh->nh_hash); } endfor_nexthops(fi) } /* Paired with READ_ONCE() from fib_table_lookup() */ WRITE_ONCE(fi->fib_dead, 1); fib_info_put(fi); } } static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) { const struct fib_nh *onh; if (fi->nh || ofi->nh) return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1; if (ofi->fib_nhs == 0) return 0; for_nexthops(fi) { onh = fib_info_nh(ofi, nhsel); if (nh->fib_nh_oif != onh->fib_nh_oif || nh->fib_nh_gw_family != onh->fib_nh_gw_family || nh->fib_nh_scope != onh->fib_nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight != onh->fib_nh_weight || #endif #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) || ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK)) return -1; if (nh->fib_nh_gw_family == AF_INET && nh->fib_nh_gw4 != onh->fib_nh_gw4) return -1; if (nh->fib_nh_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) return -1; } endfor_nexthops(fi); return 0; } static struct hlist_head *fib_nh_head(struct net_device *dev) { return &dev->fib_nh_head; } static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, u32 prefsrc, u32 priority) { unsigned int val = init_val; val ^= (protocol << 8) | scope; val ^= prefsrc; val ^= priority; return val; } static unsigned int fib_info_hashfn_result(const struct net *net, unsigned int val) { return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits); } static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) { struct net *net = fi->fib_net; unsigned int val; val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, fi->fib_scope, (__force u32)fi->fib_prefsrc, fi->fib_priority); if (fi->nh) { val ^= fi->nh->id; } else { for_nexthops(fi) { val ^= nh->fib_nh_oif; } endfor_nexthops(fi) } return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)]; } static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, __be32 val) { unsigned int hash_bits = net->ipv4.fib_info_hash_bits; u32 slot; slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits); return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot]; } static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) { /* The second half is used for prefsrc */ return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head), GFP_KERNEL); } static void fib_info_hash_free(struct hlist_head *head) { kvfree(head); } static void fib_info_hash_grow(struct net *net) { unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits; struct hlist_head *new_info_hash, *old_info_hash; unsigned int i; if (net->ipv4.fib_info_cnt < old_size) return; new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1); if (!new_info_hash) return; old_info_hash = net->ipv4.fib_info_hash; net->ipv4.fib_info_hash = new_info_hash; net->ipv4.fib_info_hash_bits += 1; for (i = 0; i < old_size; i++) { struct hlist_head *head = &old_info_hash[i]; struct hlist_node *n; struct fib_info *fi; hlist_for_each_entry_safe(fi, n, head, fib_hash) hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); } for (i = 0; i < old_size; i++) { struct hlist_head *lhead = &old_info_hash[old_size + i]; struct hlist_node *n; struct fib_info *fi; hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) hlist_add_head(&fi->fib_lhash, fib_info_laddrhash_bucket(fi->fib_net, fi->fib_prefsrc)); } fib_info_hash_free(old_info_hash); } /* no metrics, only nexthop id */ static struct fib_info *fib_find_info_nh(struct net *net, const struct fib_config *cfg) { struct hlist_head *head; struct fib_info *fi; unsigned int hash; hash = fib_info_hashfn_1(cfg->fc_nh_id, cfg->fc_protocol, cfg->fc_scope, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); hash = fib_info_hashfn_result(net, hash); head = &net->ipv4.fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { if (!fi->nh || fi->nh->id != cfg->fc_nh_id) continue; if (cfg->fc_protocol == fi->fib_protocol && cfg->fc_scope == fi->fib_scope && cfg->fc_prefsrc == fi->fib_prefsrc && cfg->fc_priority == fi->fib_priority && cfg->fc_type == fi->fib_type && cfg->fc_table == fi->fib_tb_id && !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK)) return fi; } return NULL; } static struct fib_info *fib_find_info(struct fib_info *nfi) { struct hlist_head *head = fib_info_hash_bucket(nfi); struct fib_info *fi; hlist_for_each_entry(fi, head, fib_hash) { if (fi->fib_nhs != nfi->fib_nhs) continue; if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && nfi->fib_type == fi->fib_type && nfi->fib_tb_id == fi->fib_tb_id && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && nh_comp(fi, nfi) == 0) return fi; } return NULL; } /* Check, that the gateway is already configured. * Used only by redirect accept routine, under rcu_read_lock(); */ int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct hlist_head *head; struct fib_nh *nh; head = fib_nh_head(dev); hlist_for_each_entry_rcu(nh, head, nh_hash) { DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); if (nh->fib_nh_gw4 == gw && !(nh->fib_nh_flags & RTNH_F_DEAD)) { return 0; } } return -1; } size_t fib_nlmsg_size(struct fib_info *fi) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_DST */ + nla_total_size(4) /* RTA_PRIORITY */ + nla_total_size(4) /* RTA_PREFSRC */ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ unsigned int nhs = fib_info_num_path(fi); /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); if (fi->nh) payload += nla_total_size(4); /* RTA_NH_ID */ if (nhs) { size_t nh_encapsize = 0; /* Also handles the special case nhs == 1 */ /* each nexthop is packed in an attribute */ size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); unsigned int i; /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); /* grab encap info */ for (i = 0; i < fib_info_num_path(fi); i++) { struct fib_nh_common *nhc = fib_info_nhc(fi, i); if (nhc->nhc_lwtstate) { /* RTA_ENCAP_TYPE */ nh_encapsize += lwtunnel_get_encap_size( nhc->nhc_lwtstate); /* RTA_ENCAP */ nh_encapsize += nla_total_size(2); } } /* all nexthops are packed in a nested attribute */ payload += nla_total_size((nhs * nhsize) + nh_encapsize); } return payload; } void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags) { struct fib_rt_info fri; struct sk_buff *skb; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; int err = -ENOBUFS; skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL); if (!skb) goto errout; fri.fi = fa->fa_info; fri.tb_id = tb_id; fri.dst = key; fri.dst_len = dst_len; fri.dscp = fa->fa_dscp; fri.type = fa->fa_type; fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE, info->nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); } static int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, int *last_idx, int dflt) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); struct neighbour *n; int state = NUD_NONE; if (likely(nhc->nhc_gw_family == AF_INET)) n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev); else if (nhc->nhc_gw_family == AF_INET6) n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6, nhc->nhc_dev); else n = NULL; if (n) { state = READ_ONCE(n->nud_state); neigh_release(n); } else { return 0; } if (state == NUD_REACHABLE) return 0; if ((state & NUD_VALID) && order != dflt) return 0; if ((state & NUD_VALID) || (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) { *last_resort = fi; *last_idx = order; } return 1; } int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, struct nlattr *encap, u16 encap_type, void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { int err; nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, gfp_flags); if (!nhc->nhc_pcpu_rth_output) return -ENOMEM; if (encap) { struct lwtunnel_state *lwtstate; if (encap_type == LWTUNNEL_ENCAP_NONE) { NL_SET_ERR_MSG(extack, "LWT encap type not specified"); err = -EINVAL; goto lwt_failure; } err = lwtunnel_build_state(net, encap_type, encap, nhc->nhc_family, cfg, &lwtstate, extack); if (err) goto lwt_failure; nhc->nhc_lwtstate = lwtstate_get(lwtstate); } return 0; lwt_failure: rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); nhc->nhc_pcpu_rth_output = NULL; return err; } EXPORT_SYMBOL_GPL(fib_nh_common_init); int fib_nh_init(struct net *net, struct fib_nh *nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack) { int err; nh->fib_nh_family = AF_INET; err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, GFP_KERNEL, extack); if (err) return err; nh->fib_nh_oif = cfg->fc_oif; nh->fib_nh_gw_family = cfg->fc_gw_family; if (cfg->fc_gw_family == AF_INET) nh->fib_nh_gw4 = cfg->fc_gw4; else if (cfg->fc_gw_family == AF_INET6) nh->fib_nh_gw6 = cfg->fc_gw6; nh->fib_nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; if (nh->nh_tclassid) atomic_inc(&net->ipv4.fib_num_tclassid_users); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight = nh_weight; #endif return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, struct netlink_ext_ack *extack) { int nhs = 0; while (rtnh_ok(rtnh, remaining)) { nhs++; rtnh = rtnh_next(rtnh, &remaining); } /* leftover implies invalid nexthop configuration, discard it */ if (remaining > 0) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - extra data after nexthops"); nhs = 0; } return nhs; } static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla, struct netlink_ext_ack *extack) { if (nla_len(nla) < sizeof(*gw)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY"); return -EINVAL; } *gw = nla_get_in_addr(nla); return 0; } /* only called when fib_nh is integrated into fib_info */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct net *net = fi->fib_net; struct fib_config fib_cfg; struct fib_nh *nh; int ret; change_nexthops(fi) { int attrlen; memset(&fib_cfg, 0, sizeof(fib_cfg)); if (!rtnh_ok(rtnh, remaining)) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - extra data after nexthop"); return -EINVAL; } if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { NL_SET_ERR_MSG(extack, "Invalid flags for nexthop - can not contain DEAD or LINKDOWN"); return -EINVAL; } fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; fib_cfg.fc_oif = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); nlav = nla_find(attrs, attrlen, RTA_VIA); if (nla && nlav) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (nla) { ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla, extack); if (ret) goto errout; if (fib_cfg.fc_gw4) fib_cfg.fc_gw_family = AF_INET; } else if (nlav) { ret = fib_gw_from_via(&fib_cfg, nlav, extack); if (ret) goto errout; } nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) { if (nla_len(nla) < sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW"); return -EINVAL; } fib_cfg.fc_flow = nla_get_u32(nla); } fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); /* RTA_ENCAP_TYPE length checked in * lwtunnel_valid_encap_type_attr */ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) fib_cfg.fc_encap_type = nla_get_u16(nla); } ret = fib_nh_init(net, nexthop_nh, &fib_cfg, rtnh->rtnh_hops + 1, extack); if (ret) goto errout; rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); ret = -EINVAL; nh = fib_info_nh(fi, 0); if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) { NL_SET_ERR_MSG(extack, "Nexthop device index does not match RTA_OIF"); goto errout; } if (cfg->fc_gw_family) { if (cfg->fc_gw_family != nh->fib_nh_gw_family || (cfg->fc_gw_family == AF_INET && nh->fib_nh_gw4 != cfg->fc_gw4) || (cfg->fc_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) { NL_SET_ERR_MSG(extack, "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); goto errout; } } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) { NL_SET_ERR_MSG(extack, "Nexthop class id does not match RTA_FLOW"); goto errout; } #endif ret = 0; errout: return ret; } /* only called when fib_nh is integrated into fib_info */ static void fib_rebalance(struct fib_info *fi) { int total; int w; if (fib_info_num_path(fi) < 2) return; total = 0; for_nexthops(fi) { if (nh->fib_nh_flags & RTNH_F_DEAD) continue; if (ip_ignore_linkdown(nh->fib_nh_dev) && nh->fib_nh_flags & RTNH_F_LINKDOWN) continue; total += nh->fib_nh_weight; } endfor_nexthops(fi); w = 0; change_nexthops(fi) { int upper_bound; if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) { upper_bound = -1; } else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) && nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) { upper_bound = -1; } else { w += nexthop_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; } atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound); } endfor_nexthops(fi); } #else /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel"); return -EINVAL; } #define fib_rebalance(fi) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, const struct fib_nh *nh, const struct fib_config *cfg, struct netlink_ext_ack *extack) { struct lwtunnel_state *lwtstate; int ret, result = 0; if (encap_type == LWTUNNEL_ENCAP_NONE) return 0; ret = lwtunnel_build_state(net, encap_type, encap, AF_INET, cfg, &lwtstate, extack); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws); lwtstate_free(lwtstate); } return result; } int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack) { #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; #endif if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; if (cfg->fc_nh_id) { if (fi->nh && cfg->fc_nh_id == fi->nh->id) return 0; return 1; } if (fi->nh) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp) return 1; return 0; } if (cfg->fc_oif || cfg->fc_gw_family) { struct fib_nh *nh; nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, cfg->fc_encap, nh, cfg, extack)) return 1; } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && cfg->fc_flow != nh->nh_tclassid) return 1; #endif if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) || (cfg->fc_gw_family && cfg->fc_gw_family != nh->fib_nh_gw_family)) return 1; if (cfg->fc_gw_family == AF_INET && cfg->fc_gw4 != nh->fib_nh_gw4) return 1; if (cfg->fc_gw_family == AF_INET6 && ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6)) return 1; return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (!cfg->fc_mp) return 0; rtnh = cfg->fc_mp; remaining = cfg->fc_mp_len; for_nexthops(fi) { int attrlen; if (!rtnh_ok(rtnh, remaining)) return -EINVAL; if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif) return 1; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); int err; nla = nla_find(attrs, attrlen, RTA_GATEWAY); nlav = nla_find(attrs, attrlen, RTA_VIA); if (nla && nlav) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (nla) { __be32 gw; err = fib_gw_from_attr(&gw, nla, extack); if (err) return err; if (nh->fib_nh_gw_family != AF_INET || gw != nh->fib_nh_gw4) return 1; } else if (nlav) { struct fib_config cfg2; err = fib_gw_from_via(&cfg2, nlav, extack); if (err) return err; switch (nh->fib_nh_gw_family) { case AF_INET: if (cfg2.fc_gw_family != AF_INET || cfg2.fc_gw4 != nh->fib_nh_gw4) return 1; break; case AF_INET6: if (cfg2.fc_gw_family != AF_INET6 || ipv6_addr_cmp(&cfg2.fc_gw6, &nh->fib_nh_gw6)) return 1; break; } } #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) { if (nla_len(nla) < sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW"); return -EINVAL; } if (nla_get_u32(nla) != nh->nh_tclassid) return 1; } #endif } rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); #endif return 0; } bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) { struct nlattr *nla; int remaining; if (!cfg->fc_mx) return true; nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); u32 fi_val, val; if (!type) continue; if (type > RTAX_MAX) return false; type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; bool ecn_ca = false; nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) return false; val = nla_get_u32(nla); } fi_val = fi->fib_metrics->metrics[type - 1]; if (type == RTAX_FEATURES) fi_val &= ~DST_FEATURE_ECN_CA; if (fi_val != val) return false; } return true; } static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh, u32 table, struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = table, .fc_flags = nh->fib_nh_flags | RTF_GATEWAY, .fc_ifindex = nh->fib_nh_oif, .fc_gateway = nh->fib_nh_gw6, }; struct fib6_nh fib6_nh = {}; int err; err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack); if (!err) { nh->fib_nh_dev = fib6_nh.fib_nh_dev; netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL); nh->fib_nh_oif = nh->fib_nh_dev->ifindex; nh->fib_nh_scope = RT_SCOPE_LINK; ipv6_stub->fib6_nh_release(&fib6_nh); } return err; } /* * Picture * ------- * * Semantics of nexthop is very messy by historical reasons. * We have to take into account, that: * a) gateway can be actually local interface address, * so that gatewayed route is direct. * b) gateway must be on-link address, possibly * described not by an ifaddr, but also by a direct route. * c) If both gateway and interface are specified, they should not * contradict. * d) If we use tunnel routes, gateway could be not on-link. * * Attempt to reconcile all of these (alas, self-contradictory) conditions * results in pretty ugly and hairy code with obscure logic. * * I chose to generalized it instead, so that the size * of code does not increase practically, but it becomes * much more general. * Every prefix is assigned a "scope" value: "host" is local address, * "link" is direct route, * [ ... "site" ... "interior" ... ] * and "universe" is true gateway route with global meaning. * * Every prefix refers to a set of "nexthop"s (gw, oif), * where gw must have narrower scope. This recursion stops * when gw has LOCAL scope or if "nexthop" is declared ONLINK, * which means that gw is forced to be on link. * * Code is still hairy, but now it is apparently logically * consistent and very flexible. F.e. as by-product it allows * to co-exists in peace independent exterior and interior * routing processes. * * Normally it looks as following. * * {universe prefix} -> (gw, oif) [scope link] * | * |-> {link prefix} -> (gw, oif) [scope local] * | * |-> {local prefix} (terminal node) */ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack) { struct net_device *dev; struct fib_result res; int err = 0; if (nh->fib_nh_flags & RTNH_F_ONLINK) { unsigned int addr_type; if (scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Nexthop has invalid scope"); return -EINVAL; } dev = __dev_get_by_index(net, nh->fib_nh_oif); if (!dev) { NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); return -ENODEV; } if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); return -ENETDOWN; } addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4); if (addr_type != RTN_UNICAST) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); return -EINVAL; } if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; nh->fib_nh_dev = dev; netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); nh->fib_nh_scope = RT_SCOPE_LINK; return 0; } rcu_read_lock(); { struct fib_table *tbl = NULL; struct flowi4 fl4 = { .daddr = nh->fib_nh_gw4, .flowi4_scope = scope + 1, .flowi4_oif = nh->fib_nh_oif, .flowi4_iif = LOOPBACK_IFINDEX, }; /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; if (table && table != RT_TABLE_MAIN) tbl = fib_get_table(net, table); if (tbl) err = fib_table_lookup(tbl, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE | FIB_LOOKUP_NOREF); /* on error or if no table given do full lookup. This * is needed for example when nexthops are in the local * table rather than the given table */ if (!tbl || err) { err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE); } if (err) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } } err = -EINVAL; if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } nh->fib_nh_scope = res.scope; nh->fib_nh_oif = FIB_RES_OIF(res); nh->fib_nh_dev = dev = FIB_RES_DEV(res); if (!dev) { NL_SET_ERR_MSG(extack, "No egress device for nexthop gateway"); goto out; } netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; out: rcu_read_unlock(); return err; } static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, struct netlink_ext_ack *extack) { struct in_device *in_dev; int err; if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { NL_SET_ERR_MSG(extack, "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); return -EINVAL; } rcu_read_lock(); err = -ENODEV; in_dev = inetdev_by_index(net, nh->fib_nh_oif); if (!in_dev) goto out; err = -ENETDOWN; if (!(in_dev->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); goto out; } nh->fib_nh_dev = in_dev->dev; netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); nh->fib_nh_scope = RT_SCOPE_HOST; if (!netif_carrier_ok(nh->fib_nh_dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = 0; out: rcu_read_unlock(); return err; } int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack) { int err; if (nh->fib_nh_gw_family == AF_INET) err = fib_check_nh_v4_gw(net, nh, table, scope, extack); else if (nh->fib_nh_gw_family == AF_INET6) err = fib_check_nh_v6_gw(net, nh, table, extack); else err = fib_check_nh_nongw(net, nh, extack); return err; } __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope) { struct fib_nh *nh; __be32 saddr; if (nhc->nhc_family != AF_INET) return inet_select_addr(nhc->nhc_dev, 0, scope); nh = container_of(nhc, struct fib_nh, nh_common); saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope); WRITE_ONCE(nh->nh_saddr, saddr); WRITE_ONCE(nh->nh_saddr_genid, atomic_read(&net->ipv4.dev_addr_genid)); return saddr; } __be32 fib_result_prefsrc(struct net *net, struct fib_result *res) { struct fib_nh_common *nhc = res->nhc; if (res->fi->fib_prefsrc) return res->fi->fib_prefsrc; if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); if (READ_ONCE(nh->nh_saddr_genid) == atomic_read(&net->ipv4.dev_addr_genid)) return READ_ONCE(nh->nh_saddr); } return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope); } static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) { if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fib_prefsrc != cfg->fc_dst) { u32 tb_id = cfg->fc_table; int rc; if (tb_id == RT_TABLE_MAIN) tb_id = RT_TABLE_LOCAL; rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, fib_prefsrc, tb_id); if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) { rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, fib_prefsrc, RT_TABLE_LOCAL); } if (rc != RTN_LOCAL) return false; } return true; } struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack) { int err; struct fib_info *fi = NULL; struct nexthop *nh = NULL; struct fib_info *ofi; int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; ASSERT_RTNL(); if (cfg->fc_type > RTN_MAX) goto err_inval; /* Fast check to catch the most weird cases */ if (fib_props[cfg->fc_type].scope > cfg->fc_scope) { NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; } if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { NL_SET_ERR_MSG(extack, "Invalid rtm_flags - can not contain DEAD or LINKDOWN"); goto err_inval; } if (cfg->fc_nh_id) { if (!cfg->fc_mx) { fi = fib_find_info_nh(net, cfg); if (fi) { refcount_inc(&fi->fib_treeref); return fi; } } nh = nexthop_find_by_id(net, cfg->fc_nh_id); if (!nh) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto err_inval; } nhs = 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack); if (nhs == 0) goto err_inval; } #endif fib_info_hash_grow(net); fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) { err = -ENOBUFS; goto failure; } fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); kfree(fi); return ERR_PTR(err); } fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; fi->fib_type = cfg->fc_type; fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; if (nh) { if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -EINVAL; } else { err = 0; fi->nh = nh; } } else { change_nexthops(fi) { nexthop_nh->nh_parent = fi; } endfor_nexthops(fi) if (cfg->fc_mp) err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); else err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); } if (err != 0) goto failure; if (fib_props[cfg->fc_type].error) { if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Gateway, device and multipath can not be specified for this route type"); goto err_inval; } goto link_it; } else { switch (cfg->fc_type) { case RTN_UNICAST: case RTN_LOCAL: case RTN_BROADCAST: case RTN_ANYCAST: case RTN_MULTICAST: break; default: NL_SET_ERR_MSG(extack, "Invalid route type"); goto err_inval; } } if (cfg->fc_scope > RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; } if (fi->nh) { err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack); if (err) goto failure; } else if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. */ if (nhs != 1) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); goto err_inval; } if (nh->fib_nh_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); goto err_inval; } nh->fib_nh_scope = RT_SCOPE_NOWHERE; nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif); err = -ENODEV; if (!nh->fib_nh_dev) goto failure; netdev_tracker_alloc(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL); } else { int linkdown = 0; change_nexthops(fi) { err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh, cfg->fc_table, cfg->fc_scope, extack); if (err != 0) goto failure; if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) linkdown++; } endfor_nexthops(fi) if (linkdown == fi->fib_nhs) fi->fib_flags |= RTNH_F_LINKDOWN; } if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) { NL_SET_ERR_MSG(extack, "Invalid prefsrc address"); goto err_inval; } if (!fi->nh) { change_nexthops(fi) { fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common, fi->fib_scope); if (nexthop_nh->fib_nh_gw_family == AF_INET6) fi->fib_nh_is_v6 = true; } endfor_nexthops(fi) fib_rebalance(fi); } link_it: ofi = fib_find_info(fi); if (ofi) { /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); refcount_inc(&ofi->fib_treeref); return ofi; } refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); net->ipv4.fib_info_cnt++; hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); if (fi->fib_prefsrc) { struct hlist_head *head; head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc); hlist_add_head(&fi->fib_lhash, head); } if (fi->nh) { list_add(&fi->nh_list, &nh->fi_list); } else { change_nexthops(fi) { struct hlist_head *head; if (!nexthop_nh->fib_nh_dev) continue; head = fib_nh_head(nexthop_nh->fib_nh_dev); hlist_add_head_rcu(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } return fi; err_inval: err = -EINVAL; failure: if (fi) { /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); } return ERR_PTR(err); } int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, u8 rt_family, unsigned char *flags, bool skip_oif) { if (nhc->nhc_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; if (nhc->nhc_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; rcu_read_lock(); switch (nhc->nhc_family) { case AF_INET: if (ip_ignore_linkdown(nhc->nhc_dev)) *flags |= RTNH_F_DEAD; break; case AF_INET6: if (ip6_ignore_linkdown(nhc->nhc_dev)) *flags |= RTNH_F_DEAD; break; } rcu_read_unlock(); } switch (nhc->nhc_gw_family) { case AF_INET: if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) goto nla_put_failure; break; case AF_INET6: /* if gateway family does not match nexthop family * gateway is encoded as RTA_VIA */ if (rt_family != nhc->nhc_gw_family) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) goto nla_put_failure; via = nla_data(nla); via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen); } else if (nla_put_in6_addr(skb, RTA_GATEWAY, &nhc->nhc_gw.ipv6) < 0) { goto nla_put_failure; } break; } *flags |= (nhc->nhc_flags & (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP)); if (!skip_oif && nhc->nhc_dev && nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex)) goto nla_put_failure; if (nhc->nhc_lwtstate && lwtunnel_fill_encap(skb, nhc->nhc_lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } EXPORT_SYMBOL_GPL(fib_nexthop_info); #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, int nh_weight, u8 rt_family, u32 nh_tclassid) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; unsigned char flags = 0; rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_hops = nh_weight - 1; rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0) goto nla_put_failure; rtnh->rtnh_flags = flags; if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid)) goto nla_put_failure; /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; return 0; nla_put_failure: return -EMSGSIZE; } EXPORT_SYMBOL_GPL(fib_add_nexthop); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) { struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (unlikely(fi->nh)) { if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0) goto nla_put_failure; goto mp_end; } for_nexthops(fi) { u32 nh_tclassid = 0; #ifdef CONFIG_IP_ROUTE_CLASSID nh_tclassid = nh->nh_tclassid; #endif if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, AF_INET, nh_tclassid) < 0) goto nla_put_failure; } endfor_nexthops(fi); mp_end: nla_nest_end(skb, mp); return 0; nla_put_failure: return -EMSGSIZE; } #else static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) { return 0; } #endif int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct fib_rt_info *fri, unsigned int flags) { unsigned int nhs = fib_info_num_path(fri->fi); struct fib_info *fi = fri->fi; u32 tb_id = fri->tb_id; struct nlmsghdr *nlh; struct rtmsg *rtm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET; rtm->rtm_dst_len = fri->dst_len; rtm->rtm_src_len = 0; rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp); if (tb_id < 256) rtm->rtm_table = tb_id; else rtm->rtm_table = RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, tb_id)) goto nla_put_failure; rtm->rtm_type = fri->type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len && nla_put_in_addr(skb, RTA_DST, fri->dst)) goto nla_put_failure; if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->nh) { if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id)) goto nla_put_failure; if (nexthop_is_blackhole(fi->nh)) rtm->rtm_type = RTN_BLACKHOLE; if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode)) goto offload; } if (nhs == 1) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); unsigned char flags = 0; if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0) goto nla_put_failure; rtm->rtm_flags = flags; #ifdef CONFIG_IP_ROUTE_CLASSID if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); if (nh->nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; } #endif } else { if (fib_add_multipath(skb, fi) < 0) goto nla_put_failure; } offload: if (fri->offload) rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) rtm->rtm_flags |= RTM_F_TRAP; if (fri->offload_failed) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* * Update FIB if: * - local address disappeared -> we must delete all the entries * referring to it. * - device went down -> we must shutdown all nexthops going via it. */ int fib_sync_down_addr(struct net_device *dev, __be32 local) { int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; struct net *net = dev_net(dev); struct hlist_head *head; struct fib_info *fi; int ret = 0; if (!local) return 0; head = fib_info_laddrhash_bucket(net, local); hlist_for_each_entry(fi, head, fib_lhash) { if (!net_eq(fi->fib_net, net) || fi->fib_tb_id != tb_id) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; fi->pfsrc_removed = true; ret++; } } return ret; } static int call_fib_nh_notifiers(struct fib_nh *nh, enum fib_event_type event_type) { bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev); struct fib_nh_notifier_info info = { .fib_nh = nh, }; switch (event_type) { case FIB_EVENT_NH_ADD: if (nh->fib_nh_flags & RTNH_F_DEAD) break; if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) break; return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); case FIB_EVENT_NH_DEL: if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) || (nh->fib_nh_flags & RTNH_F_DEAD)) return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); break; default: break; } return NOTIFY_DONE; } /* Update the PMTU of exceptions when: * - the new MTU of the first hop becomes smaller than the PMTU * - the old MTU was the same as the PMTU, and it limited discovery of * larger MTUs on the path. With that limit raised, we can now * discover larger MTUs * A special case is locked exceptions, for which the PMTU is smaller * than the minimal accepted PMTU: * - if the new MTU is greater than the PMTU, don't make any change * - otherwise, unlock and set PMTU */ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) { struct fnhe_hash_bucket *bucket; int i; bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!bucket) return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; for (fnhe = rcu_dereference_protected(bucket[i].chain, 1); fnhe; fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) { if (fnhe->fnhe_mtu_locked) { if (new <= fnhe->fnhe_pmtu) { fnhe->fnhe_pmtu = new; fnhe->fnhe_mtu_locked = false; } } else if (new < fnhe->fnhe_pmtu || orig == fnhe->fnhe_pmtu) { fnhe->fnhe_pmtu = new; } } } } void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) { struct hlist_head *head = fib_nh_head(dev); struct fib_nh *nh; hlist_for_each_entry(nh, head, nh_hash) { DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } /* Event force Flags Description * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed * * only used when fib_nh is built into fib_info */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { struct hlist_head *head = fib_nh_head(dev); struct fib_info *prev_fi = NULL; int scope = RT_SCOPE_NOWHERE; struct fib_nh *nh; int ret = 0; if (force) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int dead; BUG_ON(!fi->fib_nhs); DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); if (fi == prev_fi) continue; prev_fi = fi; dead = 0; change_nexthops(fi) { if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) dead++; else if (nexthop_nh->fib_nh_dev == dev && nexthop_nh->fib_nh_scope != scope) { switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_nh->fib_nh_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN; break; } call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_DEL); dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (event == NETDEV_UNREGISTER && nexthop_nh->fib_nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: fi->fib_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: fi->fib_flags |= RTNH_F_LINKDOWN; break; } ret++; } fib_rebalance(fi); } return ret; } /* Must be invoked inside of an RCU protected region. */ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; struct hlist_head *fa_head = res->fa_head; struct fib_table *tb = res->table; u8 slen = 32 - res->prefixlen; int order = -1, last_idx = -1; struct fib_alias *fa, *fa1 = NULL; u32 last_prio = res->fi->fib_priority; dscp_t last_dscp = 0; hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; struct fib_nh_common *nhc; if (fa->fa_slen != slen) continue; if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; if (fa->tb_id != tb->tb_id) continue; if (next_fi->fib_priority > last_prio && fa->fa_dscp == last_dscp) { if (last_dscp) continue; break; } if (next_fi->fib_flags & RTNH_F_DEAD) continue; last_dscp = fa->fa_dscp; last_prio = next_fi->fib_priority; if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; nhc = fib_info_nhc(next_fi, 0); if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); if (!fi) { if (next_fi != res->fi) break; fa1 = fa; } else if (!fib_detect_death(fi, order, &last_resort, &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); fa1->fa_default = order; goto out; } fi = next_fi; order++; } if (order <= 0 || !fi) { if (fa1) fa1->fa_default = -1; goto out; } if (!fib_detect_death(fi, order, &last_resort, &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); fa1->fa_default = order; goto out; } if (last_idx >= 0) fib_result_assign(res, last_resort); fa1->fa_default = last_idx; out: return; } /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. * * only used when fib_nh is built into fib_info */ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { struct fib_info *prev_fi; struct hlist_head *head; struct fib_nh *nh; int ret; if (!(dev->flags & IFF_UP)) return 0; if (nh_flags & RTNH_F_DEAD) { unsigned int flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) nh_flags |= RTNH_F_LINKDOWN; } prev_fi = NULL; head = fib_nh_head(dev); ret = 0; hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int alive; BUG_ON(!fi->fib_nhs); DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); if (fi == prev_fi) continue; prev_fi = fi; alive = 0; change_nexthops(fi) { if (!(nexthop_nh->fib_nh_flags & nh_flags)) { alive++; continue; } if (!nexthop_nh->fib_nh_dev || !(nexthop_nh->fib_nh_dev->flags & IFF_UP)) continue; if (nexthop_nh->fib_nh_dev != dev || !__in_dev_get_rtnl(dev)) continue; alive++; nexthop_nh->fib_nh_flags &= ~nh_flags; call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD); } endfor_nexthops(fi) if (alive > 0) { fi->fib_flags &= ~nh_flags; ret++; } fib_rebalance(fi); } return ret; } #ifdef CONFIG_IP_ROUTE_MULTIPATH static bool fib_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; if (nh->fib_nh_scope == RT_SCOPE_LINK) { struct neighbour *n; rcu_read_lock(); if (likely(nh->fib_nh_gw_family == AF_INET)) n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); else if (nh->fib_nh_gw_family == AF_INET6) n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); else n = NULL; if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); } return !!(state & NUD_VALID); } void fib_select_multipath(struct fib_result *res, int hash, const struct flowi4 *fl4) { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; bool found = false; bool use_neigh; __be32 saddr; if (unlikely(res->fi->nh)) { nexthop_path_fib_result(res, hash); return; } use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); saddr = fl4 ? fl4->saddr : 0; change_nexthops(fi) { int nh_upper_bound; /* Nexthops without a carrier are assigned an upper bound of * minus one when "ignore_routes_with_linkdown" is set. */ nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound); if (nh_upper_bound == -1 || (use_neigh && !fib_good_nh(nexthop_nh))) continue; if (!found) { res->nh_sel = nhsel; res->nhc = &nexthop_nh->nh_common; found = !saddr || nexthop_nh->nh_saddr == saddr; } if (hash > nh_upper_bound) continue; if (!saddr || nexthop_nh->nh_saddr == saddr) { res->nh_sel = nhsel; res->nhc = &nexthop_nh->nh_common; return; } if (found) return; } endfor_nexthops(fi); } #endif void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb) { if (fl4->flowi4_oif) goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h, fl4); } else #endif if (!res->prefixlen && res->table->tb_num_default > 1 && res->type == RTN_UNICAST) fib_select_default(fl4, res); check_saddr: if (!fl4->saddr) { struct net_device *l3mdev; l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev); if (!l3mdev || l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev) fl4->saddr = fib_result_prefsrc(net, res); else fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); } } int __net_init fib4_semantics_init(struct net *net) { unsigned int hash_bits = 4; net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); if (!net->ipv4.fib_info_hash) return -ENOMEM; net->ipv4.fib_info_hash_bits = hash_bits; net->ipv4.fib_info_cnt = 0; return 0; } void __net_exit fib4_semantics_exit(struct net *net) { fib_info_hash_free(net->ipv4.fib_info_hash); }
169 170 170 169 170 170 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/stddef.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/rculist.h> #include <net/caif/cfpkt.h> #include <net/caif/cfmuxl.h> #include <net/caif/cfsrvl.h> #include <net/caif/cffrml.h> #define container_obj(layr) container_of(layr, struct cfmuxl, layer) #define CAIF_CTRL_CHANNEL 0 #define UP_CACHE_SIZE 8 #define DN_CACHE_SIZE 8 struct cfmuxl { struct cflayer layer; struct list_head srvl_list; struct list_head frml_list; struct cflayer *up_cache[UP_CACHE_SIZE]; struct cflayer *dn_cache[DN_CACHE_SIZE]; /* * Set when inserting or removing downwards layers. */ spinlock_t transmit_lock; /* * Set when inserting or removing upwards layers. */ spinlock_t receive_lock; }; static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt); static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); static struct cflayer *get_up(struct cfmuxl *muxl, u16 id); struct cflayer *cfmuxl_create(void) { struct cfmuxl *this = kzalloc(sizeof(struct cfmuxl), GFP_ATOMIC); if (!this) return NULL; this->layer.receive = cfmuxl_receive; this->layer.transmit = cfmuxl_transmit; this->layer.ctrlcmd = cfmuxl_ctrlcmd; INIT_LIST_HEAD(&this->srvl_list); INIT_LIST_HEAD(&this->frml_list); spin_lock_init(&this->transmit_lock); spin_lock_init(&this->receive_lock); snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "mux"); return &this->layer; } int cfmuxl_set_dnlayer(struct cflayer *layr, struct cflayer *dn, u8 phyid) { struct cfmuxl *muxl = (struct cfmuxl *) layr; spin_lock_bh(&muxl->transmit_lock); list_add_rcu(&dn->node, &muxl->frml_list); spin_unlock_bh(&muxl->transmit_lock); return 0; } static struct cflayer *get_from_id(struct list_head *list, u16 id) { struct cflayer *lyr; list_for_each_entry_rcu(lyr, list, node) { if (lyr->id == id) return lyr; } return NULL; } int cfmuxl_set_uplayer(struct cflayer *layr, struct cflayer *up, u8 linkid) { struct cfmuxl *muxl = container_obj(layr); struct cflayer *old; spin_lock_bh(&muxl->receive_lock); /* Two entries with same id is wrong, so remove old layer from mux */ old = get_from_id(&muxl->srvl_list, linkid); if (old != NULL) list_del_rcu(&old->node); list_add_rcu(&up->node, &muxl->srvl_list); spin_unlock_bh(&muxl->receive_lock); return 0; } struct cflayer *cfmuxl_remove_dnlayer(struct cflayer *layr, u8 phyid) { struct cfmuxl *muxl = container_obj(layr); struct cflayer *dn; int idx = phyid % DN_CACHE_SIZE; spin_lock_bh(&muxl->transmit_lock); RCU_INIT_POINTER(muxl->dn_cache[idx], NULL); dn = get_from_id(&muxl->frml_list, phyid); if (dn == NULL) goto out; list_del_rcu(&dn->node); caif_assert(dn != NULL); out: spin_unlock_bh(&muxl->transmit_lock); return dn; } static struct cflayer *get_up(struct cfmuxl *muxl, u16 id) { struct cflayer *up; int idx = id % UP_CACHE_SIZE; up = rcu_dereference(muxl->up_cache[idx]); if (up == NULL || up->id != id) { spin_lock_bh(&muxl->receive_lock); up = get_from_id(&muxl->srvl_list, id); rcu_assign_pointer(muxl->up_cache[idx], up); spin_unlock_bh(&muxl->receive_lock); } return up; } static struct cflayer *get_dn(struct cfmuxl *muxl, struct dev_info *dev_info) { struct cflayer *dn; int idx = dev_info->id % DN_CACHE_SIZE; dn = rcu_dereference(muxl->dn_cache[idx]); if (dn == NULL || dn->id != dev_info->id) { spin_lock_bh(&muxl->transmit_lock); dn = get_from_id(&muxl->frml_list, dev_info->id); rcu_assign_pointer(muxl->dn_cache[idx], dn); spin_unlock_bh(&muxl->transmit_lock); } return dn; } struct cflayer *cfmuxl_remove_uplayer(struct cflayer *layr, u8 id) { struct cflayer *up; struct cfmuxl *muxl = container_obj(layr); int idx = id % UP_CACHE_SIZE; if (id == 0) { pr_warn("Trying to remove control layer\n"); return NULL; } spin_lock_bh(&muxl->receive_lock); up = get_from_id(&muxl->srvl_list, id); if (up == NULL) goto out; RCU_INIT_POINTER(muxl->up_cache[idx], NULL); list_del_rcu(&up->node); out: spin_unlock_bh(&muxl->receive_lock); return up; } static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt) { int ret; struct cfmuxl *muxl = container_obj(layr); u8 id; struct cflayer *up; if (cfpkt_extr_head(pkt, &id, 1) < 0) { pr_err("erroneous Caif Packet\n"); cfpkt_destroy(pkt); return -EPROTO; } rcu_read_lock(); up = get_up(muxl, id); if (up == NULL) { pr_debug("Received data on unknown link ID = %d (0x%x)" " up == NULL", id, id); cfpkt_destroy(pkt); /* * Don't return ERROR, since modem misbehaves and sends out * flow on before linksetup response. */ rcu_read_unlock(); return /* CFGLU_EPROT; */ 0; } /* We can't hold rcu_lock during receive, so take a ref count instead */ cfsrvl_get(up); rcu_read_unlock(); ret = up->receive(up, pkt); cfsrvl_put(up); return ret; } static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt) { struct cfmuxl *muxl = container_obj(layr); int err; u8 linkid; struct cflayer *dn; struct caif_payload_info *info = cfpkt_info(pkt); BUG_ON(!info); rcu_read_lock(); dn = get_dn(muxl, info->dev_info); if (dn == NULL) { pr_debug("Send data on unknown phy ID = %d (0x%x)\n", info->dev_info->id, info->dev_info->id); rcu_read_unlock(); cfpkt_destroy(pkt); return -ENOTCONN; } info->hdr_len += 1; linkid = info->channel_id; cfpkt_add_head(pkt, &linkid, 1); /* We can't hold rcu_lock during receive, so take a ref count instead */ cffrml_hold(dn); rcu_read_unlock(); err = dn->transmit(dn, pkt); cffrml_put(dn); return err; } static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfmuxl *muxl = container_obj(layr); struct cflayer *layer; rcu_read_lock(); list_for_each_entry_rcu(layer, &muxl->srvl_list, node) { if (cfsrvl_phyid_match(layer, phyid) && layer->ctrlcmd) { if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND || ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) && layer->id != 0) cfmuxl_remove_uplayer(layr, layer->id); /* NOTE: ctrlcmd is not allowed to block */ layer->ctrlcmd(layer, ctrl, phyid); } } rcu_read_unlock(); }
148 154 1 149 505 509 503 203 181 133 6 160 124 158 122 12 29 2 2 3 20 3 448 448 1 119 468 380 138 2 432 122 153 471 472 153 153 152 1 472 145 472 144 472 145 4 435 121 431 132 126 127 131 131 430 432 432 431 138 138 431 432 114 20 432 432 432 431 432 2 121 120 75 75 197 49 160 2 160 2 156 17 1 2 14 7 9 185 1 1 181 2 166 37 4 1 2 1 3 2 7 149 7 149 145 6 144 149 141 2 126 2 3 4 149 149 149 149 149 1 148 148 148 1 147 146 146 1 146 142 142 145 145 144 144 144 149 148 151 3 149 145 3 152 149 6 3 156 1 149 6 155 152 143 12 3 5 1 7 17 15 4 9 12 10 11 4 17 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * cpuid support routines * * derived from arch/x86/kvm/x86.c * * Copyright 2011 Red Hat, Inc. and/or its affiliates. * Copyright IBM Corporation, 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "linux/lockdep.h" #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <linux/sched/stat.h> #include <asm/processor.h> #include <asm/user.h> #include <asm/fpu/xstate.h> #include <asm/sgx.h> #include <asm/cpuid/api.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" #include "trace.h" #include "pmu.h" #include "xen.h" /* * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be * aligned to sizeof(unsigned long) because it's not accessed via bitops. */ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); struct cpuid_xstate_sizes { u32 eax; u32 ebx; u32 ecx; }; static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init; void __init kvm_init_xstate_sizes(void) { u32 ign; int i; for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) { struct cpuid_xstate_sizes *xs = &xstate_sizes[i]; cpuid_count(0xD, i, &xs->eax, &xs->ebx, &xs->ecx, &ign); } } u32 xstate_required_size(u64 xstate_bv, bool compacted) { u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; int i; xstate_bv &= XFEATURE_MASK_EXTEND; for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes) && xstate_bv; i++) { struct cpuid_xstate_sizes *xs = &xstate_sizes[i]; u32 offset; if (!(xstate_bv & BIT_ULL(i))) continue; /* ECX[1]: 64B alignment in compacted form */ if (compacted) offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret; else offset = xs->ebx; ret = max(ret, offset + xs->eax); xstate_bv &= ~BIT_ULL(i); } return ret; } struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; int i; /* * KVM has a semi-arbitrary rule that querying the guest's CPUID model * with IRQs disabled is disallowed. The CPUID model can legitimately * have over one hundred entries, i.e. the lookup is slow, and IRQs are * typically disabled in KVM only when KVM is in a performance critical * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break * if this rule is violated, this assertion is purely to flag potential * performance issues. If this fires, consider moving the lookup out * of the hotpath, e.g. by caching information during CPUID updates. */ lockdep_assert_irqs_enabled(); for (i = 0; i < nent; i++) { e = &entries[i]; if (e->function != function) continue; /* * If the index isn't significant, use the first entry with a * matching function. It's userspace's responsibility to not * provide "duplicate" entries in all cases. */ if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index) return e; /* * Similarly, use the first matching entry if KVM is doing a * lookup (as opposed to emulating CPUID) for a function that's * architecturally defined as not having a significant index. */ if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) { /* * Direct lookups from KVM should not diverge from what * KVM defines internally (the architectural behavior). */ WARN_ON_ONCE(cpuid_function_is_indexed(function)); return e; } } return NULL; } EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2); static int kvm_check_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; u64 xfeatures; /* * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0) return -EINVAL; } /* * Exposing dynamic xfeatures to the guest requires additional * enabling in the FPU, e.g. to expand the guest XSAVE state size. */ best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0); if (!best) return 0; xfeatures = best->eax | ((u64)best->edx << 32); xfeatures &= XFEATURE_MASK_USER_DYNAMIC; if (!xfeatures) return 0; return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu); static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); /* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) { struct kvm_cpuid_entry2 *orig; int i; /* * Apply runtime CPUID updates to the incoming CPUID entries to avoid * false positives due mismatches on KVM-owned feature flags. * * Note! @e2 and @nent track the _old_ CPUID entries! */ kvm_update_cpuid_runtime(vcpu); kvm_apply_cpuid_pv_features_quirk(vcpu); if (nent != vcpu->arch.cpuid_nent) return -EINVAL; for (i = 0; i < nent; i++) { orig = &vcpu->arch.cpuid_entries[i]; if (e2[i].function != orig->function || e2[i].index != orig->index || e2[i].flags != orig->flags || e2[i].eax != orig->eax || e2[i].ebx != orig->ebx || e2[i].ecx != orig->ecx || e2[i].edx != orig->edx) return -EINVAL; } return 0; } static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, const char *sig) { struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; u32 base; for_each_possible_cpuid_base_hypervisor(base) { entry = kvm_find_cpuid_entry(vcpu, base); if (entry) { u32 signature[3]; signature[0] = entry->ebx; signature[1] = entry->ecx; signature[2] = entry->edx; if (!memcmp(signature, sig, sizeof(signature))) { cpuid.base = base; cpuid.limit = entry->eax; break; } } } return cpuid; } static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu) { struct kvm_hypervisor_cpuid kvm_cpuid; struct kvm_cpuid_entry2 *best; kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); if (!kvm_cpuid.base) return 0; best = kvm_find_cpuid_entry(vcpu, kvm_cpuid.base | KVM_CPUID_FEATURES); if (!best) return 0; if (kvm_hlt_in_guest(vcpu->kvm)) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); return best->eax; } /* * Calculate guest's supported XCR0 taking into account guest CPUID data and * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0). */ static u64 cpuid_get_supported_xcr0(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0); if (!best) return 0; return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entry, unsigned int x86_feature, bool has_feature) { cpuid_entry_change(entry, x86_feature, has_feature); guest_cpu_cap_change(vcpu, x86_feature, has_feature); } static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; vcpu->arch.cpuid_dynamic_bits_dirty = false; best = kvm_find_cpuid_entry(vcpu, 1); if (best) { kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSXSAVE, kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)); kvm_update_feature_runtime(vcpu, best, X86_FEATURE_APIC, vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) kvm_update_feature_runtime(vcpu, best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } best = kvm_find_cpuid_entry_index(vcpu, 7, 0); if (best) kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSPKE, kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)); best = kvm_find_cpuid_entry_index(vcpu, 0xD, 0); if (best) best->ebx = xstate_required_size(vcpu->arch.xcr0, false); best = kvm_find_cpuid_entry_index(vcpu, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); } static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_HYPERV struct kvm_cpuid_entry2 *entry; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX; #else return false; #endif } static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *entry; entry = kvm_find_cpuid_entry(vcpu, 0); if (!entry) return false; return is_guest_vendor_amd(entry->ebx, entry->ecx, entry->edx) || is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx); } /* * This isn't truly "unsafe", but except for the cpu_caps initialization code, * all register lookups should use __cpuid_entry_get_reg(), which provides * compile-time validation of the input. */ static u32 cpuid_get_reg_unsafe(struct kvm_cpuid_entry2 *entry, u32 reg) { switch (reg) { case CPUID_EAX: return entry->eax; case CPUID_EBX: return entry->ebx; case CPUID_ECX: return entry->ecx; case CPUID_EDX: return entry->edx; default: WARN_ON_ONCE(1); return 0; } } static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func, bool include_partially_emulated); void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; struct kvm_cpuid_entry2 *entry; bool allow_gbpages; int i; memset(vcpu->arch.cpu_caps, 0, sizeof(vcpu->arch.cpu_caps)); BUILD_BUG_ON(ARRAY_SIZE(reverse_cpuid) != NR_KVM_CPU_CAPS); /* * Reset guest capabilities to userspace's guest CPUID definition, i.e. * honor userspace's definition for features that don't require KVM or * hardware management/support (or that KVM simply doesn't care about). */ for (i = 0; i < NR_KVM_CPU_CAPS; i++) { const struct cpuid_reg cpuid = reverse_cpuid[i]; struct kvm_cpuid_entry2 emulated; if (!cpuid.function) continue; entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) continue; cpuid_func_emulated(&emulated, cpuid.function, true); /* * A vCPU has a feature if it's supported by KVM and is enabled * in guest CPUID. Note, this includes features that are * supported by KVM but aren't advertised to userspace! */ vcpu->arch.cpu_caps[i] = kvm_cpu_caps[i] | cpuid_get_reg_unsafe(&emulated, cpuid.reg); vcpu->arch.cpu_caps[i] &= cpuid_get_reg_unsafe(entry, cpuid.reg); } kvm_update_cpuid_runtime(vcpu); /* * If TDP is enabled, let the guest use GBPAGES if they're supported in * hardware. The hardware page walker doesn't let KVM disable GBPAGES, * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA * walk for performance and complexity reasons. Not to mention KVM * _can't_ solve the problem because GVA->GPA walks aren't visible to * KVM once a TDP translation is installed. Mimic hardware behavior so * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. * If TDP is disabled, honor *only* guest CPUID as KVM has full control * and can install smaller shadow pages if the host lacks 1GiB support. */ allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES); guest_cpu_cap_change(vcpu, X86_FEATURE_GBPAGES, allow_gbpages); best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; else apic->lapic_timer.timer_mode_mask = 1 << 17; kvm_apic_set_version(vcpu); } vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu); vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu); vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); kvm_pmu_refresh(vcpu); #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) | __cr4_reserved_bits(guest_cpu_cap_has, vcpu); #undef __kvm_cpu_cap_has kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu)); /* Invoke the vendor callback only after the above state is updated. */ kvm_x86_call(vcpu_after_set_cpuid)(vcpu); /* * Except for the MMU, which needs to do its thing any vendor specific * adjustments to the reserved GPA bits. */ kvm_mmu_after_set_cpuid(vcpu); } int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x80000000); if (!best || best->eax < 0x80000008) goto not_found; best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) return best->eax & 0xff; not_found: return 36; } int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x80000000); if (!best || best->eax < 0x80000008) goto not_found; best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) return (best->eax >> 16) & 0xff; not_found: return 0; } /* * This "raw" version returns the reserved GPA bits without any adjustments for * encryption technologies that usurp bits. The raw mask should be used if and * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs. */ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) { return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); } static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) { u32 vcpu_caps[NR_KVM_CPU_CAPS]; int r; /* * Swap the existing (old) entries with the incoming (new) entries in * order to massage the new entries, e.g. to account for dynamic bits * that KVM controls, without clobbering the current guest CPUID, which * KVM needs to preserve in order to unwind on failure. * * Similarly, save the vCPU's current cpu_caps so that the capabilities * can be updated alongside the CPUID entries when performing runtime * updates. Full initialization is done if and only if the vCPU hasn't * run, i.e. only if userspace is potentially changing CPUID features. */ swap(vcpu->arch.cpuid_entries, e2); swap(vcpu->arch.cpuid_nent, nent); memcpy(vcpu_caps, vcpu->arch.cpu_caps, sizeof(vcpu_caps)); BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps)); /* * KVM does not correctly handle changing guest CPUID after KVM_RUN, as * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with * the core vCPU model on the fly. It would've been better to forbid any * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do * KVM_SET_CPUID{,2} again. To support this legacy behavior, check * whether the supplied CPUID data is equal to what's already set. */ if (kvm_vcpu_has_run(vcpu)) { r = kvm_cpuid_check_equal(vcpu, e2, nent); if (r) goto err; goto success; } #ifdef CONFIG_KVM_HYPERV if (kvm_cpuid_has_hyperv(vcpu)) { r = kvm_hv_vcpu_init(vcpu); if (r) goto err; } #endif r = kvm_check_cpuid(vcpu); if (r) goto err; #ifdef CONFIG_KVM_XEN vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE); #endif kvm_vcpu_after_set_cpuid(vcpu); success: kvfree(e2); return 0; err: memcpy(vcpu->arch.cpu_caps, vcpu_caps, sizeof(vcpu_caps)); swap(vcpu->arch.cpuid_entries, e2); swap(vcpu->arch.cpuid_nent, nent); return r; } /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries) { int r, i; struct kvm_cpuid_entry *e = NULL; struct kvm_cpuid_entry2 *e2 = NULL; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) return -E2BIG; if (cpuid->nent) { e = vmemdup_array_user(entries, cpuid->nent, sizeof(*e)); if (IS_ERR(e)) return PTR_ERR(e); e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT); if (!e2) { r = -ENOMEM; goto out_free_cpuid; } } for (i = 0; i < cpuid->nent; i++) { e2[i].function = e[i].function; e2[i].eax = e[i].eax; e2[i].ebx = e[i].ebx; e2[i].ecx = e[i].ecx; e2[i].edx = e[i].edx; e2[i].index = 0; e2[i].flags = 0; e2[i].padding[0] = 0; e2[i].padding[1] = 0; e2[i].padding[2] = 0; } r = kvm_set_cpuid(vcpu, e2, cpuid->nent); if (r) kvfree(e2); out_free_cpuid: kvfree(e); return r; } int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { struct kvm_cpuid_entry2 *e2 = NULL; int r; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) return -E2BIG; if (cpuid->nent) { e2 = vmemdup_array_user(entries, cpuid->nent, sizeof(*e2)); if (IS_ERR(e2)) return PTR_ERR(e2); } r = kvm_set_cpuid(vcpu, e2, cpuid->nent); if (r) kvfree(e2); return r; } int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { if (cpuid->nent < vcpu->arch.cpuid_nent) return -E2BIG; if (vcpu->arch.cpuid_dynamic_bits_dirty) kvm_update_cpuid_runtime(vcpu); if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) return -EFAULT; cpuid->nent = vcpu->arch.cpuid_nent; return 0; } static __always_inline u32 raw_cpuid_get(struct cpuid_reg cpuid) { struct kvm_cpuid_entry2 entry; u32 base; /* * KVM only supports features defined by Intel (0x0), AMD (0x80000000), * and Centaur (0xc0000000). WARN if a feature for new vendor base is * defined, as this and other code would need to be updated. */ base = cpuid.function & 0xffff0000; if (WARN_ON_ONCE(base && base != 0x80000000 && base != 0xc0000000)) return 0; if (cpuid_eax(base) < cpuid.function) return 0; cpuid_count(cpuid.function, cpuid.index, &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); return *__cpuid_entry_get_reg(&entry, cpuid.reg); } /* * For kernel-defined leafs, mask KVM's supported feature set with the kernel's * capabilities as well as raw CPUID. For KVM-defined leafs, consult only raw * CPUID, as KVM is the one and only authority (in the kernel). */ #define kvm_cpu_cap_init(leaf, feature_initializers...) \ do { \ const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); \ const u32 __maybe_unused kvm_cpu_cap_init_in_progress = leaf; \ const u32 *kernel_cpu_caps = boot_cpu_data.x86_capability; \ u32 kvm_cpu_cap_passthrough = 0; \ u32 kvm_cpu_cap_synthesized = 0; \ u32 kvm_cpu_cap_emulated = 0; \ u32 kvm_cpu_cap_features = 0; \ \ feature_initializers \ \ kvm_cpu_caps[leaf] = kvm_cpu_cap_features; \ \ if (leaf < NCAPINTS) \ kvm_cpu_caps[leaf] &= kernel_cpu_caps[leaf]; \ \ kvm_cpu_caps[leaf] |= kvm_cpu_cap_passthrough; \ kvm_cpu_caps[leaf] &= (raw_cpuid_get(cpuid) | \ kvm_cpu_cap_synthesized); \ kvm_cpu_caps[leaf] |= kvm_cpu_cap_emulated; \ } while (0) /* * Assert that the feature bit being declared, e.g. via F(), is in the CPUID * word that's being initialized. Exempt 0x8000_0001.EDX usage of 0x1.EDX * features, as AMD duplicated many 0x1.EDX features into 0x8000_0001.EDX. */ #define KVM_VALIDATE_CPU_CAP_USAGE(name) \ do { \ u32 __leaf = __feature_leaf(X86_FEATURE_##name); \ \ BUILD_BUG_ON(__leaf != kvm_cpu_cap_init_in_progress); \ } while (0) #define F(name) \ ({ \ KVM_VALIDATE_CPU_CAP_USAGE(name); \ kvm_cpu_cap_features |= feature_bit(name); \ }) /* Scattered Flag - For features that are scattered by cpufeatures.h. */ #define SCATTERED_F(name) \ ({ \ BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \ KVM_VALIDATE_CPU_CAP_USAGE(name); \ if (boot_cpu_has(X86_FEATURE_##name)) \ F(name); \ }) /* Features that KVM supports only on 64-bit kernels. */ #define X86_64_F(name) \ ({ \ KVM_VALIDATE_CPU_CAP_USAGE(name); \ if (IS_ENABLED(CONFIG_X86_64)) \ F(name); \ }) /* * Emulated Feature - For features that KVM emulates in software irrespective * of host CPU/kernel support. */ #define EMULATED_F(name) \ ({ \ kvm_cpu_cap_emulated |= feature_bit(name); \ F(name); \ }) /* * Synthesized Feature - For features that are synthesized into boot_cpu_data, * i.e. may not be present in the raw CPUID, but can still be advertised to * userspace. Primarily used for mitigation related feature flags. */ #define SYNTHESIZED_F(name) \ ({ \ kvm_cpu_cap_synthesized |= feature_bit(name); \ F(name); \ }) /* * Passthrough Feature - For features that KVM supports based purely on raw * hardware CPUID, i.e. that KVM virtualizes even if the host kernel doesn't * use the feature. Simply force set the feature in KVM's capabilities, raw * CPUID support will be factored in by kvm_cpu_cap_mask(). */ #define PASSTHROUGH_F(name) \ ({ \ kvm_cpu_cap_passthrough |= feature_bit(name); \ F(name); \ }) /* * Aliased Features - For features in 0x8000_0001.EDX that are duplicates of * identical 0x1.EDX features, and thus are aliased from 0x1 to 0x8000_0001. */ #define ALIASED_1_EDX_F(name) \ ({ \ BUILD_BUG_ON(__feature_leaf(X86_FEATURE_##name) != CPUID_1_EDX); \ BUILD_BUG_ON(kvm_cpu_cap_init_in_progress != CPUID_8000_0001_EDX); \ kvm_cpu_cap_features |= feature_bit(name); \ }) /* * Vendor Features - For features that KVM supports, but are added in later * because they require additional vendor enabling. */ #define VENDOR_F(name) \ ({ \ KVM_VALIDATE_CPU_CAP_USAGE(name); \ }) /* * Runtime Features - For features that KVM dynamically sets/clears at runtime, * e.g. when CR4 changes, but which are never advertised to userspace. */ #define RUNTIME_F(name) \ ({ \ KVM_VALIDATE_CPU_CAP_USAGE(name); \ }) /* * Undefine the MSR bit macro to avoid token concatenation issues when * processing X86_FEATURE_SPEC_CTRL_SSBD. */ #undef SPEC_CTRL_SSBD /* DS is defined by ptrace-abi.h on 32-bit builds. */ #undef DS void kvm_set_cpu_caps(void) { memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); kvm_cpu_cap_init(CPUID_1_ECX, F(XMM3), F(PCLMULQDQ), VENDOR_F(DTES64), /* * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not* * advertised to guests via CPUID! MWAIT is also technically a * runtime flag thanks to IA32_MISC_ENABLES; mark it as such so * that KVM is aware that it's a known, unadvertised flag. */ RUNTIME_F(MWAIT), /* DS-CPL */ VENDOR_F(VMX), /* SMX, EST */ /* TM2 */ F(SSSE3), /* CNXT-ID */ /* Reserved */ F(FMA), F(CX16), /* xTPR Update */ F(PDCM), F(PCID), /* Reserved, DCA */ F(XMM4_1), F(XMM4_2), EMULATED_F(X2APIC), F(MOVBE), F(POPCNT), EMULATED_F(TSC_DEADLINE_TIMER), F(AES), F(XSAVE), RUNTIME_F(OSXSAVE), F(AVX), F(F16C), F(RDRAND), EMULATED_F(HYPERVISOR), ); kvm_cpu_cap_init(CPUID_1_EDX, F(FPU), F(VME), F(DE), F(PSE), F(TSC), F(MSR), F(PAE), F(MCE), F(CX8), F(APIC), /* Reserved */ F(SEP), F(MTRR), F(PGE), F(MCA), F(CMOV), F(PAT), F(PSE36), /* PSN */ F(CLFLUSH), /* Reserved */ VENDOR_F(DS), /* ACPI */ F(MMX), F(FXSR), F(XMM), F(XMM2), F(SELFSNOOP), /* HTT, TM, Reserved, PBE */ ); kvm_cpu_cap_init(CPUID_7_0_EBX, F(FSGSBASE), EMULATED_F(TSC_ADJUST), F(SGX), F(BMI1), F(HLE), F(AVX2), F(FDP_EXCPTN_ONLY), F(SMEP), F(BMI2), F(ERMS), F(INVPCID), F(RTM), F(ZERO_FCS_FDS), VENDOR_F(MPX), F(AVX512F), F(AVX512DQ), F(RDSEED), F(ADX), F(SMAP), F(AVX512IFMA), F(CLFLUSHOPT), F(CLWB), VENDOR_F(INTEL_PT), F(AVX512PF), F(AVX512ER), F(AVX512CD), F(SHA_NI), F(AVX512BW), F(AVX512VL), ); kvm_cpu_cap_init(CPUID_7_ECX, F(AVX512VBMI), PASSTHROUGH_F(LA57), F(PKU), RUNTIME_F(OSPKE), F(RDPID), F(AVX512_VPOPCNTDQ), F(UMIP), F(AVX512_VBMI2), F(GFNI), F(VAES), F(VPCLMULQDQ), F(AVX512_VNNI), F(AVX512_BITALG), F(CLDEMOTE), F(MOVDIRI), F(MOVDIR64B), VENDOR_F(WAITPKG), F(SGX_LC), F(BUS_LOCK_DETECT), ); /* * PKU not yet implemented for shadow paging and requires OSPKE * to be set on the host. Clear it if that is not the case */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) kvm_cpu_cap_clear(X86_FEATURE_PKU); kvm_cpu_cap_init(CPUID_7_EDX, F(AVX512_4VNNIW), F(AVX512_4FMAPS), F(SPEC_CTRL), F(SPEC_CTRL_SSBD), EMULATED_F(ARCH_CAPABILITIES), F(INTEL_STIBP), F(MD_CLEAR), F(AVX512_VP2INTERSECT), F(FSRM), F(SERIALIZE), F(TSXLDTRK), F(AVX512_FP16), F(AMX_TILE), F(AMX_INT8), F(AMX_BF16), F(FLUSH_L1D), ); if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) && boot_cpu_has(X86_FEATURE_AMD_IBPB) && boot_cpu_has(X86_FEATURE_AMD_IBRS)) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL); if (boot_cpu_has(X86_FEATURE_STIBP)) kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_init(CPUID_7_1_EAX, F(SHA512), F(SM3), F(SM4), F(AVX_VNNI), F(AVX512_BF16), F(CMPCCXADD), F(FZRM), F(FSRS), F(FSRC), F(WRMSRNS), F(AMX_FP16), F(AVX_IFMA), F(LAM), ); kvm_cpu_cap_init(CPUID_7_1_EDX, F(AVX_VNNI_INT8), F(AVX_NE_CONVERT), F(AMX_COMPLEX), F(AVX_VNNI_INT16), F(PREFETCHITI), F(AVX10), ); kvm_cpu_cap_init(CPUID_7_2_EDX, F(INTEL_PSFD), F(IPRED_CTRL), F(RRSBA_CTRL), F(DDPD_U), F(BHI_CTRL), F(MCDT_NO), ); kvm_cpu_cap_init(CPUID_D_1_EAX, F(XSAVEOPT), F(XSAVEC), F(XGETBV1), F(XSAVES), X86_64_F(XFD), ); kvm_cpu_cap_init(CPUID_12_EAX, SCATTERED_F(SGX1), SCATTERED_F(SGX2), SCATTERED_F(SGX_EDECCSSA), ); kvm_cpu_cap_init(CPUID_24_0_EBX, F(AVX10_128), F(AVX10_256), F(AVX10_512), ); kvm_cpu_cap_init(CPUID_8000_0001_ECX, F(LAHF_LM), F(CMP_LEGACY), VENDOR_F(SVM), /* ExtApicSpace */ F(CR8_LEGACY), F(ABM), F(SSE4A), F(MISALIGNSSE), F(3DNOWPREFETCH), F(OSVW), /* IBS */ F(XOP), /* SKINIT, WDT, LWP */ F(FMA4), F(TBM), F(TOPOEXT), VENDOR_F(PERFCTR_CORE), ); kvm_cpu_cap_init(CPUID_8000_0001_EDX, ALIASED_1_EDX_F(FPU), ALIASED_1_EDX_F(VME), ALIASED_1_EDX_F(DE), ALIASED_1_EDX_F(PSE), ALIASED_1_EDX_F(TSC), ALIASED_1_EDX_F(MSR), ALIASED_1_EDX_F(PAE), ALIASED_1_EDX_F(MCE), ALIASED_1_EDX_F(CX8), ALIASED_1_EDX_F(APIC), /* Reserved */ F(SYSCALL), ALIASED_1_EDX_F(MTRR), ALIASED_1_EDX_F(PGE), ALIASED_1_EDX_F(MCA), ALIASED_1_EDX_F(CMOV), ALIASED_1_EDX_F(PAT), ALIASED_1_EDX_F(PSE36), /* Reserved */ F(NX), /* Reserved */ F(MMXEXT), ALIASED_1_EDX_F(MMX), ALIASED_1_EDX_F(FXSR), F(FXSR_OPT), X86_64_F(GBPAGES), F(RDTSCP), /* Reserved */ X86_64_F(LM), F(3DNOWEXT), F(3DNOW), ); if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) kvm_cpu_cap_set(X86_FEATURE_GBPAGES); kvm_cpu_cap_init(CPUID_8000_0007_EDX, SCATTERED_F(CONSTANT_TSC), ); kvm_cpu_cap_init(CPUID_8000_0008_EBX, F(CLZERO), F(XSAVEERPTR), F(WBNOINVD), F(AMD_IBPB), F(AMD_IBRS), F(AMD_SSBD), F(VIRT_SSBD), F(AMD_SSB_NO), F(AMD_STIBP), F(AMD_STIBP_ALWAYS_ON), F(AMD_IBRS_SAME_MODE), F(AMD_PSFD), F(AMD_IBPB_RET), ); /* * AMD has separate bits for each SPEC_CTRL bit. * arch/x86/kernel/cpu/bugs.c is kind enough to * record that in cpufeatures so use them. */ if (boot_cpu_has(X86_FEATURE_IBPB)) { kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB); if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && !boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB_RET); } if (boot_cpu_has(X86_FEATURE_IBRS)) kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS); if (boot_cpu_has(X86_FEATURE_STIBP)) kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP); if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO); /* * The preference is to use SPEC CTRL MSR instead of the * VIRT_SPEC MSR. */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) && !boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); /* All SVM features required additional vendor module enabling. */ kvm_cpu_cap_init(CPUID_8000_000A_EDX, VENDOR_F(NPT), VENDOR_F(VMCBCLEAN), VENDOR_F(FLUSHBYASID), VENDOR_F(NRIPS), VENDOR_F(TSCRATEMSR), VENDOR_F(V_VMSAVE_VMLOAD), VENDOR_F(LBRV), VENDOR_F(PAUSEFILTER), VENDOR_F(PFTHRESHOLD), VENDOR_F(VGIF), VENDOR_F(VNMI), VENDOR_F(SVME_ADDR_CHK), ); kvm_cpu_cap_init(CPUID_8000_001F_EAX, VENDOR_F(SME), VENDOR_F(SEV), /* VM_PAGE_FLUSH */ VENDOR_F(SEV_ES), F(SME_COHERENT), ); kvm_cpu_cap_init(CPUID_8000_0021_EAX, F(NO_NESTED_DATA_BP), F(WRMSR_XX_BASE_NS), /* * Synthesize "LFENCE is serializing" into the AMD-defined entry * in KVM's supported CPUID, i.e. if the feature is reported as * supported by the kernel. LFENCE_RDTSC was a Linux-defined * synthetic feature long before AMD joined the bandwagon, e.g. * LFENCE is serializing on most CPUs that support SSE2. On * CPUs that don't support AMD's leaf, ANDing with the raw host * CPUID will drop the flags, and reporting support in AMD's * leaf can make it easier for userspace to detect the feature. */ SYNTHESIZED_F(LFENCE_RDTSC), /* SmmPgCfgLock */ F(NULL_SEL_CLR_BASE), /* UpperAddressIgnore */ F(AUTOIBRS), F(PREFETCHI), EMULATED_F(NO_SMM_CTL_MSR), /* PrefetchCtlMsr */ /* GpOnUserCpuid */ /* EPSF */ SYNTHESIZED_F(SBPB), SYNTHESIZED_F(IBPB_BRTYPE), SYNTHESIZED_F(SRSO_NO), F(SRSO_USER_KERNEL_NO), ); kvm_cpu_cap_init(CPUID_8000_0022_EAX, F(PERFMON_V2), ); if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE); kvm_cpu_cap_init(CPUID_C000_0001_EDX, F(XSTORE), F(XSTORE_EN), F(XCRYPT), F(XCRYPT_EN), F(ACE2), F(ACE2_EN), F(PHE), F(PHE_EN), F(PMM), F(PMM_EN), ); /* * Hide RDTSCP and RDPID if either feature is reported as supported but * probing MSR_TSC_AUX failed. This is purely a sanity check and * should never happen, but the guest will likely crash if RDTSCP or * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in * the past. For example, the sanity check may fire if this instance of * KVM is running as L1 on top of an older, broken KVM. */ if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) || kvm_cpu_cap_has(X86_FEATURE_RDPID)) && !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) { kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); kvm_cpu_cap_clear(X86_FEATURE_RDPID); } } EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); #undef F #undef SCATTERED_F #undef X86_64_F #undef EMULATED_F #undef SYNTHESIZED_F #undef PASSTHROUGH_F #undef ALIASED_1_EDX_F #undef VENDOR_F #undef RUNTIME_F struct kvm_cpuid_array { struct kvm_cpuid_entry2 *entries; int maxnent; int nent; }; static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array) { if (array->nent >= array->maxnent) return NULL; return &array->entries[array->nent++]; } static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, u32 function, u32 index) { struct kvm_cpuid_entry2 *entry = get_next_cpuid(array); if (!entry) return NULL; memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; switch (function & 0xC0000000) { case 0x40000000: /* Hypervisor leaves are always synthesized by __do_cpuid_func. */ return entry; case 0x80000000: /* * 0x80000021 is sometimes synthesized by __do_cpuid_func, which * would result in out-of-bounds calls to do_host_cpuid. */ { static int max_cpuid_80000000; if (!READ_ONCE(max_cpuid_80000000)) WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000)); if (function > READ_ONCE(max_cpuid_80000000)) return entry; } break; default: break; } cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); if (cpuid_function_is_indexed(function)) entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; return entry; } static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func, bool include_partially_emulated) { memset(entry, 0, sizeof(*entry)); entry->function = func; entry->index = 0; entry->flags = 0; switch (func) { case 0: entry->eax = 7; return 1; case 1: entry->ecx = feature_bit(MOVBE); /* * KVM allows userspace to enumerate MONITOR+MWAIT support to * the guest, but the MWAIT feature flag is never advertised * to userspace because MONITOR+MWAIT aren't virtualized by * hardware, can't be faithfully emulated in software (KVM * emulates them as NOPs), and allowing the guest to execute * them natively requires enabling a per-VM capability. */ if (include_partially_emulated) entry->ecx |= feature_bit(MWAIT); return 1; case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->eax = 0; if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) entry->ecx = feature_bit(RDPID); return 1; default: return 0; } } static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) { if (array->nent >= array->maxnent) return -E2BIG; array->nent += cpuid_func_emulated(&array->entries[array->nent], func, false); return 0; } static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) { struct kvm_cpuid_entry2 *entry; int r, i, max_idx; /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); r = -E2BIG; entry = do_host_cpuid(array, function, 0); if (!entry) goto out; switch (function) { case 0: /* Limited to the highest leaf implemented in KVM. */ entry->eax = min(entry->eax, 0x24U); break; case 1: cpuid_entry_override(entry, CPUID_1_EDX); cpuid_entry_override(entry, CPUID_1_ECX); break; case 2: /* * On ancient CPUs, function 2 entries are STATEFUL. That is, * CPUID(function=2, index=0) may return different results each * time, with the least-significant byte in EAX enumerating the * number of times software should do CPUID(2, 0). * * Modern CPUs, i.e. every CPU KVM has *ever* run on are less * idiotic. Intel's SDM states that EAX & 0xff "will always * return 01H. Software should ignore this value and not * interpret it as an informational descriptor", while AMD's * APM states that CPUID(2) is reserved. * * WARN if a frankenstein CPU that supports virtualization and * a stateful CPUID.0x2 is encountered. */ WARN_ON_ONCE((entry->eax & 0xff) > 1); break; /* functions 4 and 0x8000001d have additional index. */ case 4: case 0x8000001d: /* * Read entries until the cache type in the previous entry is * zero, i.e. indicates an invalid entry. */ for (i = 1; entry->eax & 0x1f; ++i) { entry = do_host_cpuid(array, function, i); if (!entry) goto out; } break; case 6: /* Thermal management */ entry->eax = 0x4; /* allow ARAT */ entry->ebx = 0; entry->ecx = 0; entry->edx = 0; break; /* function 7 has additional index. */ case 7: max_idx = entry->eax = min(entry->eax, 2u); cpuid_entry_override(entry, CPUID_7_0_EBX); cpuid_entry_override(entry, CPUID_7_ECX); cpuid_entry_override(entry, CPUID_7_EDX); /* KVM only supports up to 0x7.2, capped above via min(). */ if (max_idx >= 1) { entry = do_host_cpuid(array, function, 1); if (!entry) goto out; cpuid_entry_override(entry, CPUID_7_1_EAX); cpuid_entry_override(entry, CPUID_7_1_EDX); entry->ebx = 0; entry->ecx = 0; } if (max_idx >= 2) { entry = do_host_cpuid(array, function, 2); if (!entry) goto out; cpuid_entry_override(entry, CPUID_7_2_EDX); entry->ecx = 0; entry->ebx = 0; entry->eax = 0; } break; case 0xa: { /* Architectural Performance Monitoring */ union cpuid10_eax eax = { }; union cpuid10_edx edx = { }; if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } eax.split.version_id = kvm_pmu_cap.version; eax.split.num_counters = kvm_pmu_cap.num_counters_gp; eax.split.bit_width = kvm_pmu_cap.bit_width_gp; eax.split.mask_length = kvm_pmu_cap.events_mask_len; edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed; edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed; if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; entry->eax = eax.full; entry->ebx = kvm_pmu_cap.events_mask; entry->ecx = 0; entry->edx = edx.full; break; } case 0x1f: case 0xb: /* * No topology; a valid topology is indicated by the presence * of subleaf 1. */ entry->eax = entry->ebx = entry->ecx = 0; break; case 0xd: { u64 permitted_xcr0 = kvm_get_filtered_xcr0(); u64 permitted_xss = kvm_caps.supported_xss; entry->eax &= permitted_xcr0; entry->ebx = xstate_required_size(permitted_xcr0, false); entry->ecx = entry->ebx; entry->edx &= permitted_xcr0 >> 32; if (!permitted_xcr0) break; entry = do_host_cpuid(array, function, 1); if (!entry) goto out; cpuid_entry_override(entry, CPUID_D_1_EAX); if (entry->eax & (feature_bit(XSAVES) | feature_bit(XSAVEC))) entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss, true); else { WARN_ON_ONCE(permitted_xss != 0); entry->ebx = 0; } entry->ecx &= permitted_xss; entry->edx &= permitted_xss >> 32; for (i = 2; i < 64; ++i) { bool s_state; if (permitted_xcr0 & BIT_ULL(i)) s_state = false; else if (permitted_xss & BIT_ULL(i)) s_state = true; else continue; entry = do_host_cpuid(array, function, i); if (!entry) goto out; /* * The supported check above should have filtered out * invalid sub-leafs. Only valid sub-leafs should * reach this point, and they should have a non-zero * save state size. Furthermore, check whether the * processor agrees with permitted_xcr0/permitted_xss * on whether this is an XCR0- or IA32_XSS-managed area. */ if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) { --array->nent; continue; } if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) entry->ecx &= ~BIT_ULL(2); entry->edx = 0; } break; } case 0x12: /* Intel SGX */ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } /* * Index 0: Sub-features, MISCSELECT (a.k.a extended features) * and max enclave sizes. The SGX sub-features and MISCSELECT * are restricted by kernel and KVM capabilities (like most * feature flags), while enclave size is unrestricted. */ cpuid_entry_override(entry, CPUID_12_EAX); entry->ebx &= SGX_MISC_EXINFO; entry = do_host_cpuid(array, function, 1); if (!entry) goto out; /* * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la * feature flags. Advertise all supported flags, including * privileged attributes that require explicit opt-in from * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is * expected to derive it from supported XCR0. */ entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK; entry->ebx &= 0; break; /* Intel PT */ case 0x14: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { if (!do_host_cpuid(array, function, i)) goto out; } break; /* Intel AMX TILE */ case 0x1d: if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { if (!do_host_cpuid(array, function, i)) goto out; } break; case 0x1e: /* TMUL information */ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } break; case 0x24: { u8 avx10_version; if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } /* * The AVX10 version is encoded in EBX[7:0]. Note, the version * is guaranteed to be >=1 if AVX10 is supported. Note #2, the * version needs to be captured before overriding EBX features! */ avx10_version = min_t(u8, entry->ebx & 0xff, 1); cpuid_entry_override(entry, CPUID_24_0_EBX); entry->ebx |= avx10_version; entry->eax = 0; entry->ecx = 0; entry->edx = 0; break; } case KVM_CPUID_SIGNATURE: { const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; entry->edx = sigptr[2]; break; } case KVM_CPUID_FEATURES: entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | (1 << KVM_FEATURE_PV_UNHALT) | (1 << KVM_FEATURE_PV_TLB_FLUSH) | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | (1 << KVM_FEATURE_PV_SEND_IPI) | (1 << KVM_FEATURE_POLL_CONTROL) | (1 << KVM_FEATURE_PV_SCHED_YIELD) | (1 << KVM_FEATURE_ASYNC_PF_INT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); entry->ebx = 0; entry->ecx = 0; entry->edx = 0; break; case 0x80000000: entry->eax = min(entry->eax, 0x80000022); /* * Serializing LFENCE is reported in a multitude of ways, and * NullSegClearsBase is not reported in CPUID on Zen2; help * userspace by providing the CPUID leaf ourselves. * * However, only do it if the host has CPUID leaf 0x8000001d. * QEMU thinks that it can query the host blindly for that * CPUID leaf if KVM reports that it supports 0x8000001d or * above. The processor merrily returns values from the * highest Intel leaf which QEMU tries to use as the guest's * 0x8000001d. Even worse, this can result in an infinite * loop if said highest leaf has no subleaves indexed by ECX. */ if (entry->eax >= 0x8000001d && (static_cpu_has(X86_FEATURE_LFENCE_RDTSC) || !static_cpu_has_bug(X86_BUG_NULL_SEG))) entry->eax = max(entry->eax, 0x80000021); break; case 0x80000001: entry->ebx &= ~GENMASK(27, 16); cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; case 0x80000005: /* Pass host L1 cache and TLB info. */ break; case 0x80000006: /* Drop reserved bits, pass host L2 cache and TLB info. */ entry->edx &= ~GENMASK(17, 16); break; case 0x80000007: /* Advanced power management */ cpuid_entry_override(entry, CPUID_8000_0007_EDX); /* mask against host */ entry->edx &= boot_cpu_data.x86_power; entry->eax = entry->ebx = entry->ecx = 0; break; case 0x80000008: { /* * GuestPhysAddrSize (EAX[23:16]) is intended for software * use. * * KVM's ABI is to report the effective MAXPHYADDR for the * guest in PhysAddrSize (phys_as), and the maximum * *addressable* GPA in GuestPhysAddrSize (g_phys_as). * * GuestPhysAddrSize is valid if and only if TDP is enabled, * in which case the max GPA that can be addressed by KVM may * be less than the max GPA that can be legally generated by * the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't * support 5-level TDP. */ unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U); unsigned int phys_as, g_phys_as; /* * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as * the guest operates in the same PA space as the host, i.e. * reductions in MAXPHYADDR for memory encryption affect shadow * paging, too. * * If TDP is enabled, use the raw bare metal MAXPHYADDR as * reductions to the HPAs do not affect GPAs. The max * addressable GPA is the same as the max effective GPA, except * that it's capped at 48 bits if 5-level TDP isn't supported * (hardware processes bits 51:48 only when walking the fifth * level page table). */ if (!tdp_enabled) { phys_as = boot_cpu_data.x86_phys_bits; g_phys_as = 0; } else { phys_as = entry->eax & 0xff; g_phys_as = phys_as; if (kvm_mmu_get_max_tdp_level() < 5) g_phys_as = min(g_phys_as, 48U); } entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16); entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); break; } case 0x8000000A: if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper ASID emulation to nested SVM */ entry->ecx = 0; /* Reserved */ cpuid_entry_override(entry, CPUID_8000_000A_EDX); break; case 0x80000019: entry->ecx = entry->edx = 0; break; case 0x8000001a: entry->eax &= GENMASK(2, 0); entry->ebx = entry->ecx = entry->edx = 0; break; case 0x8000001e: /* Do not return host topology information. */ entry->eax = entry->ebx = entry->ecx = 0; entry->edx = 0; /* reserved */ break; case 0x8000001F: if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; } else { cpuid_entry_override(entry, CPUID_8000_001F_EAX); /* Clear NumVMPL since KVM does not support VMPL. */ entry->ebx &= ~GENMASK(31, 12); /* * Enumerate '0' for "PA bits reduction", the adjusted * MAXPHYADDR is enumerated directly (see 0x80000008). */ entry->ebx &= ~GENMASK(11, 6); } break; case 0x80000020: entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; case 0x80000021: entry->ebx = entry->ecx = entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0021_EAX); break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { union cpuid_0x80000022_ebx ebx = { }; entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { entry->eax = entry->ebx = 0; break; } cpuid_entry_override(entry, CPUID_8000_0022_EAX); ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; entry->ebx = ebx.full; break; } /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ entry->eax = min(entry->eax, 0xC0000004); break; case 0xC0000001: cpuid_entry_override(entry, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ case 0xC0000002: case 0xC0000003: case 0xC0000004: default: entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } r = 0; out: put_cpu(); return r; } static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func, unsigned int type) { if (type == KVM_GET_EMULATED_CPUID) return __do_cpuid_func_emulated(array, func); return __do_cpuid_func(array, func); } #define CENTAUR_CPUID_SIGNATURE 0xC0000000 static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, unsigned int type) { u32 limit; int r; if (func == CENTAUR_CPUID_SIGNATURE && boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) return 0; r = do_cpuid_func(array, func, type); if (r) return r; limit = array->entries[array->nent - 1].eax; for (func = func + 1; func <= limit; ++func) { r = do_cpuid_func(array, func, type); if (r) break; } return r; } static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, __u32 num_entries, unsigned int ioctl_type) { int i; __u32 pad[3]; if (ioctl_type != KVM_GET_EMULATED_CPUID) return false; /* * We want to make sure that ->padding is being passed clean from * userspace in case we want to use it for something in the future. * * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we * have to give ourselves satisfied only with the emulated side. /me * sheds a tear. */ for (i = 0; i < num_entries; i++) { if (copy_from_user(pad, entries[i].padding, sizeof(pad))) return true; if (pad[0] || pad[1] || pad[2]) return true; } return false; } int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type) { static const u32 funcs[] = { 0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE, }; struct kvm_cpuid_array array = { .nent = 0, }; int r, i; if (cpuid->nent < 1) return -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; if (sanity_check_entries(entries, cpuid->nent, type)) return -EINVAL; array.entries = kvcalloc(cpuid->nent, sizeof(struct kvm_cpuid_entry2), GFP_KERNEL); if (!array.entries) return -ENOMEM; array.maxnent = cpuid->nent; for (i = 0; i < ARRAY_SIZE(funcs); i++) { r = get_cpuid_func(&array, funcs[i], type); if (r) goto out_free; } cpuid->nent = array.nent; if (copy_to_user(entries, array.entries, array.nent * sizeof(struct kvm_cpuid_entry2))) r = -EFAULT; out_free: kvfree(array.entries); return r; } /* * Intel CPUID semantics treats any query for an out-of-range leaf as if the * highest basic leaf (i.e. CPUID.0H:EAX) were requested. AMD CPUID semantics * returns all zeroes for any undefined leaf, whether or not the leaf is in * range. Centaur/VIA follows Intel semantics. * * A leaf is considered out-of-range if its function is higher than the maximum * supported leaf of its associated class or if its associated class does not * exist. * * There are three primary classes to be considered, with their respective * ranges described as "<base> - <top>[,<base2> - <top2>] inclusive. A primary * class exists if a guest CPUID entry for its <base> leaf exists. For a given * class, CPUID.<base>.EAX contains the max supported leaf for the class. * * - Basic: 0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff * - Hypervisor: 0x40000000 - 0x4fffffff * - Extended: 0x80000000 - 0xbfffffff * - Centaur: 0xc0000000 - 0xcfffffff * * The Hypervisor class is further subdivided into sub-classes that each act as * their own independent class associated with a 0x100 byte range. E.g. if Qemu * is advertising support for both HyperV and KVM, the resulting Hypervisor * CPUID sub-classes are: * * - HyperV: 0x40000000 - 0x400000ff * - KVM: 0x40000100 - 0x400001ff */ static struct kvm_cpuid_entry2 * get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) { struct kvm_cpuid_entry2 *basic, *class; u32 function = *fn_ptr; basic = kvm_find_cpuid_entry(vcpu, 0); if (!basic) return NULL; if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) || is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx)) return NULL; if (function >= 0x40000000 && function <= 0x4fffffff) class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00); else if (function >= 0xc0000000) class = kvm_find_cpuid_entry(vcpu, 0xc0000000); else class = kvm_find_cpuid_entry(vcpu, function & 0x80000000); if (class && function <= class->eax) return NULL; /* * Leaf specific adjustments are also applied when redirecting to the * max basic entry, e.g. if the max basic leaf is 0xb but there is no * entry for CPUID.0xb.index (see below), then the output value for EDX * needs to be pulled from CPUID.0xb.1. */ *fn_ptr = basic->eax; /* * The class does not exist or the requested function is out of range; * the effective CPUID entry is the max basic leaf. Note, the index of * the original requested leaf is observed! */ return kvm_find_cpuid_entry_index(vcpu, basic->eax, index); } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only) { u32 orig_function = *eax, function = *eax, index = *ecx; struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; if (vcpu->arch.cpuid_dynamic_bits_dirty) kvm_update_cpuid_runtime(vcpu); entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; if (!entry && !exact_only) { entry = get_out_of_range_cpuid_entry(vcpu, &function, index); used_max_basic = !!entry; } if (entry) { *eax = entry->eax; *ebx = entry->ebx; *ecx = entry->ecx; *edx = entry->edx; if (function == 7 && index == 0) { u64 data; if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) && !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(feature_bit(RTM) | feature_bit(HLE)); } else if (function == 0x80000007) { if (kvm_hv_invtsc_suppressed(vcpu)) *edx &= ~feature_bit(CONSTANT_TSC); } else if (IS_ENABLED(CONFIG_KVM_XEN) && kvm_xen_is_tsc_leaf(vcpu, function)) { /* * Update guest TSC frequency information if necessary. * Ignore failures, there is no sane value that can be * provided if KVM can't get the TSC frequency. */ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) kvm_guest_time_update(vcpu); if (index == 1) { *ecx = vcpu->arch.pvclock_tsc_mul; *edx = vcpu->arch.pvclock_tsc_shift; } else if (index == 2) { *eax = vcpu->arch.hw_tsc_khz; } } } else { *eax = *ebx = *ecx = *edx = 0; /* * When leaf 0BH or 1FH is defined, CL is pass-through * and EDX is always the x2APIC ID, even for undefined * subleaves. Index 1 will exist iff the leaf is * implemented, so we pass through CL iff leaf 1 * exists. EDX can be copied from any existing index. */ if (function == 0xb || function == 0x1f) { entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { *ecx = index & 0xff; *edx = entry->edx; } } } trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact, used_max_basic); return exact; } EXPORT_SYMBOL_GPL(kvm_cpuid); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 eax, ebx, ecx, edx; if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) return 1; eax = kvm_rax_read(vcpu); ecx = kvm_rcx_read(vcpu); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); kvm_rax_write(vcpu, eax); kvm_rbx_write(vcpu, ebx); kvm_rcx_write(vcpu, ecx); kvm_rdx_write(vcpu, edx); return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
6 6 6 11 5 6 6 6 227 227 166 8228 8269 8229 8276 536 3071 3078 2490 3079 8191 8209 2466 3017 617 620 616 7310 7328 2197 2395 240 242 234 18 18 9 10 5 5 5 2 2 2 1 1 1 10 8 5 5 2 2 3 3 8 9 2 2 2 16 1 4 4 8 2 7 1 3 1 4 1 2 2 2 2 2 11 5 5 3 3 8 8 2 2 1 1 2 1 5 1 4 3 1 1 7 7 216 228 12 12 6 6 6 6 6 6 4 3 1 9 2 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ns->binfmt_misc); #endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; }; /* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_up_base(extents, map, id, count); else extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } u32 map_id_up(struct uid_gid_map *map, u32 id) { return map_id_range_up(map, id, 1); } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup_array(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. */ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init);
249 249 249 515 266 248 7 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 // SPDX-License-Identifier: GPL-2.0-or-later /* * HID support for Linux * * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2007-2008 Oliver Neukum * Copyright (c) 2006-2012 Jiri Kosina * Copyright (c) 2012 Henrik Rydberg */ /* */ #include <linux/module.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/unaligned.h> #include <asm/byteorder.h> #include <linux/hid.h> static struct hid_driver hid_generic; static int __check_hid_generic(struct device_driver *drv, void *data) { struct hid_driver *hdrv = to_hid_driver(drv); struct hid_device *hdev = data; if (hdrv == &hid_generic) return 0; return hid_match_device(hdev, hdrv) != NULL; } static bool hid_generic_match(struct hid_device *hdev, bool ignore_special_driver) { if (ignore_special_driver) return true; if (hdev->quirks & HID_QUIRK_IGNORE_SPECIAL_DRIVER) return true; if (hdev->quirks & HID_QUIRK_HAVE_SPECIAL_DRIVER) return false; /* * If any other driver wants the device, leave the device to this other * driver. */ if (bus_for_each_drv(&hid_bus_type, NULL, hdev, __check_hid_generic)) return false; return true; } static int hid_generic_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; hdev->quirks |= HID_QUIRK_INPUT_PER_APP; ret = hid_parse(hdev); if (ret) return ret; return hid_hw_start(hdev, HID_CONNECT_DEFAULT); } static const struct hid_device_id hid_table[] = { { HID_DEVICE(HID_BUS_ANY, HID_GROUP_ANY, HID_ANY_ID, HID_ANY_ID) }, { } }; MODULE_DEVICE_TABLE(hid, hid_table); static struct hid_driver hid_generic = { .name = "hid-generic", .id_table = hid_table, .match = hid_generic_match, .probe = hid_generic_probe, }; module_hid_driver(hid_generic); MODULE_AUTHOR("Henrik Rydberg"); MODULE_DESCRIPTION("HID generic driver"); MODULE_LICENSE("GPL");
36 2 2 2 2 1 18 13 3 16 2 2 3 1 1 1 3 6 3 7 5 225 213 14 14 10 10 14 252 4 251 2 1 183 33 195 1 184 22 1 37 37 3 6 29 29 9 5 5 33 2 6 2 15 2 5 2 3 5 16 3 2 2 17 13 17 2 12 2 4 3 15 26 25 1 3 1 19 2 1 17 1 10 7 4 66 9 6 33 8 10 2 7 33 6 5 2 1 1 1 10 4 4 2 6 8 30 36 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool_netlink.h> #include <linux/bitmap.h> #include "netlink.h" #include "bitset.h" /* Some bitmaps are internally represented as an array of unsigned long, some * as an array of u32 (some even as single u32 for now). To avoid the need of * wrappers on caller side, we provide two set of functions: those with "32" * suffix in their names expect u32 based bitmaps, those without it expect * unsigned long bitmaps. */ static u32 ethnl_lower_bits(unsigned int n) { return ~(u32)0 >> (32 - n % 32); } static u32 ethnl_upper_bits(unsigned int n) { return ~(u32)0 << (n % 32); } /** * ethnl_bitmap32_clear() - Clear u32 based bitmap * @dst: bitmap to clear * @start: beginning of the interval * @end: end of the interval * @mod: set if bitmap was modified * * Clear @nbits bits of a bitmap with indices @start <= i < @end */ static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end, bool *mod) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; unsigned int i; u32 mask; if (end <= start) return; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } return; } if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } start_word++; } for (i = start_word; i < end_word; i++) { if (dst[i]) { dst[i] = 0; *mod = true; } } if (end % 32) { mask = ethnl_lower_bits(end); if (dst[end_word] & mask) { dst[end_word] &= ~mask; *mod = true; } } } /** * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval * @map: bitmap to test * @start: beginning of the interval * @end: end of the interval * * Return: true if there is non-zero bit with index @start <= i < @end, * false if the whole interval is zero */ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, unsigned int end) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; u32 mask; if (end <= start) return true; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); return map[start_word] & mask; } if (map[start_word] & mask) return true; start_word++; } if (!memchr_inv(map + start_word, '\0', (end_word - start_word) * sizeof(u32))) return true; if (end % 32 == 0) return true; return map[end_word] & ethnl_lower_bits(end); } /** * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask * pair * @dst: bitmap to update * @nbits: bit size of the bitmap * @value: values to set * @mask: mask of bits to set * @mod: set to true if bitmap is modified, preserve if not * * Set bits in @dst bitmap which are set in @mask to values from @value, leave * the rest untouched. If destination bitmap was modified, set @mod to true, * leave as it is if not. */ static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits, const u32 *value, const u32 *mask, bool *mod) { while (nbits > 0) { u32 real_mask = mask ? *mask : ~(u32)0; u32 new_value; if (nbits < 32) real_mask &= ethnl_lower_bits(nbits); new_value = (*dst & ~real_mask) | (*value & real_mask); if (new_value != *dst) { *dst = new_value; *mod = true; } if (nbits <= 32) break; dst++; nbits -= 32; value++; if (mask) mask++; } } static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index) { return map[index / 32] & (1U << (index % 32)); } /** * ethnl_bitset32_size() - Calculate size of bitset nested attribute * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: assume compact format for output * * Estimate length of netlink attribute composed by a later call to * ethnl_put_bitset32() call with the same arguments. * * Return: negative error code or attribute length estimate */ int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { unsigned int len = 0; /* list flag */ if (!mask) len += nla_total_size(sizeof(u32)); /* size */ len += nla_total_size(sizeof(u32)); if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); /* value, mask */ len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32)); } else { unsigned int bits_len = 0; unsigned int bit_len, i; for (i = 0; i < nbits; i++) { const char *name = names ? names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; /* index */ bit_len = nla_total_size(sizeof(u32)); /* name */ if (name) bit_len += ethnl_strz_size(name); /* value */ if (mask && ethnl_bitmap32_test_bit(val, i)) bit_len += nla_total_size(0); /* bit nest */ bits_len += nla_total_size(bit_len); } /* bits nest */ len += nla_total_size(bits_len); } /* outermost nest */ return nla_total_size(len); } /** * ethnl_put_bitset32() - Put a bitset nest into a message * @skb: skb with the message * @attrtype: attribute type for the bitset nest * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: use compact format for the output * * Compose a nested attribute representing a bitset. If @mask is null, simple * bitmap (bit list) is created, if @mask is provided, represent a value/mask * pair. Bit names are only used in verbose mode and when provided by calller. * * Return: 0 on success, negative error value on error */ int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { struct nlattr *nest; struct nlattr *attr; nest = nla_nest_start(skb, attrtype); if (!nest) return -EMSGSIZE; if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK)) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits)) goto nla_put_failure; if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); unsigned int nbytes = nwords * sizeof(u32); u32 *dst; attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, val, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); if (mask) { attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, mask, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); } } else { struct nlattr *bits; unsigned int i; bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS); if (!bits) goto nla_put_failure; for (i = 0; i < nbits; i++) { const char *name = names ? names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT); if (!attr) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i)) goto nla_put_failure; if (name && ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name)) goto nla_put_failure; if (mask && ethnl_bitmap32_test_bit(val, i) && nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE)) goto nla_put_failure; nla_nest_end(skb, attr); } nla_nest_end(skb, bits); } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct nla_policy bitset_policy[] = { [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32, ETHNL_MAX_BITSET_SIZE), [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, }; static const struct nla_policy bit_policy[] = { [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING }, [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG }, }; /** * ethnl_bitset_is_compact() - check if bitset attribute represents a compact * bitset * @bitset: nested attribute representing a bitset * @compact: pointer for return value * * Return: 0 on success, negative error code on failure */ int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; int ret; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, bitset, bitset_policy, NULL); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) { if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK]) return -EINVAL; *compact = false; return 0; } if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE]) return -EINVAL; *compact = true; return 0; } /** * ethnl_name_to_idx() - look up string index for a name * @names: array of ETH_GSTRING_LEN sized strings * @n_names: number of strings in the array * @name: name to look up * * Return: index of the string if found, -ENOENT if not found */ static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names, const char *name) { unsigned int i; if (!names) return -ENOENT; for (i = 0; i < n_names; i++) { /* names[i] may not be null terminated */ if (!strncmp(names[i], name, ETH_GSTRING_LEN) && strlen(name) <= ETH_GSTRING_LEN) return i; } return -ENOENT; } static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits, const struct nlattr *bit_attr, bool no_mask, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bit_policy)]; int ret, idx; ret = nla_parse_nested(tb, ARRAY_SIZE(bit_policy) - 1, bit_attr, bit_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) { const char *name; idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]); if (idx >= nbits) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_INDEX], "bit index too high"); return -EOPNOTSUPP; } name = names ? names[idx] : NULL; if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name && strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name, nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "bit index and name mismatch"); return -EINVAL; } } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) { idx = ethnl_name_to_idx(names, nbits, nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME])); if (idx < 0) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_NAME], "bit name not found"); return -EOPNOTSUPP; } } else { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "neither bit index nor name specified"); return -EINVAL; } *index = idx; *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE]; return 0; } /** * ethnl_bitmap32_equal() - Compare two bitmaps * @map1: first bitmap * @map2: second bitmap * @nbits: bit size to compare * * Return: true if first @nbits are equal, false if not */ static bool ethnl_bitmap32_equal(const u32 *map1, const u32 *map2, unsigned int nbits) { if (memcmp(map1, map2, nbits / 32 * sizeof(u32))) return false; if (nbits % 32 == 0) return true; return !((map1[nbits / 32] ^ map2[nbits / 32]) & ethnl_lower_bits(nbits % 32)); } static int ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, struct nlattr **tb, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { u32 *saved_bitmap = NULL; struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (no_mask) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); unsigned int nbytes = nwords * sizeof(u32); bool dummy; /* The bitmap size is only the size of the map part without * its mask part. */ saved_bitmap = kcalloc(nwords, sizeof(u32), GFP_KERNEL); if (!saved_bitmap) return -ENOMEM; memcpy(saved_bitmap, bitmap, nbytes); ethnl_bitmap32_clear(bitmap, 0, nbits, &dummy); } nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { bool old_val, new_val; unsigned int idx; if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); kfree(saved_bitmap); return -EINVAL; } ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) { kfree(saved_bitmap); return ret; } old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); if (new_val != old_val) { if (new_val) bitmap[idx / 32] |= ((u32)1 << (idx % 32)); else bitmap[idx / 32] &= ~((u32)1 << (idx % 32)); if (!no_mask) *mod = true; } } if (no_mask && !ethnl_bitmap32_equal(saved_bitmap, bitmap, nbits)) *mod = true; kfree(saved_bitmap); return 0; } static int ethnl_compact_sanity_checks(unsigned int nbits, const struct nlattr *nest, struct nlattr **tb, struct netlink_ext_ack *extack) { bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; unsigned int attr_nbits, attr_nwords; const struct nlattr *test_attr; if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask not allowed in list bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_SIZE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing size in compact bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing value in compact bitset"); return -EINVAL; } if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing mask in compact nonlist bitset"); return -EINVAL; } attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); attr_nwords = DIV_ROUND_UP(attr_nbits, 32); if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "bitset value length does not match size"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK] && nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "bitset mask length does not match size"); return -EINVAL; } if (attr_nbits <= nbits) return 0; test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] : tb[ETHTOOL_A_BITSET_MASK]; if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) { NL_SET_ERR_MSG_ATTR(extack, test_attr, "cannot modify bits past kernel bitset size"); return -EINVAL; } return 0; } /** * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap * @bitmap: bitmap to update * @nbits: size of the updated bitmap in bits * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * @mod: set this to true if bitmap is modified, leave as it is if not * * Apply bitset netsted attribute to a bitmap. If the attribute represents * a bit list, @bitmap is set to its contents; otherwise, bits in mask are * set to values from value. Bitmaps in the attribute may be longer than * @nbits but the message must not request modifying any bits past @nbits. * * Return: negative error code on failure, 0 on success */ int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; unsigned int change_bits; bool no_mask; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb, names, extack, mod); ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; change_bits = min_t(unsigned int, nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits); ethnl_bitmap32_update(bitmap, change_bits, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), no_mask ? NULL : nla_data(tb[ETHTOOL_A_BITSET_MASK]), mod); if (no_mask && change_bits < nbits) ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod); return 0; } /** * ethnl_parse_bitset() - Compute effective value and mask from bitset nest * @val: unsigned long based bitmap to put value into * @mask: unsigned long based bitmap to put mask into * @nbits: size of @val and @mask bitmaps * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * * Provide @nbits size long bitmaps for value and mask so that * x = (val & mask) | (x & ~mask) would modify any @nbits sized bitmap x * the same way ethnl_update_bitset() with the same bitset attribute would. * * Return: negative error code on failure, 0 on success */ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; const struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (!tb[ETHTOOL_A_BITSET_BITS]) { unsigned int change_bits; ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); if (change_bits > nbits) change_bits = nbits; bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), change_bits); if (change_bits < nbits) bitmap_clear(val, change_bits, nbits - change_bits); if (no_mask) { bitmap_fill(mask, nbits); } else { bitmap_from_arr32(mask, nla_data(tb[ETHTOOL_A_BITSET_MASK]), change_bits); if (change_bits < nbits) bitmap_clear(mask, change_bits, nbits - change_bits); } return 0; } if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } bitmap_zero(val, nbits); if (no_mask) bitmap_fill(mask, nbits); else bitmap_zero(mask, nbits); nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { unsigned int idx; bool bit_val; ret = ethnl_parse_bit(&idx, &bit_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) return ret; if (bit_val) __set_bit(idx, val); if (!no_mask) __set_bit(idx, mask); } return 0; } #if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) /* 64-bit big endian architectures are the only case when u32 based bitmaps * and unsigned long based bitmaps have different memory layout so that we * cannot simply cast the latter to the former and need actual wrappers * converting the latter to the former. * * To reduce the number of slab allocations, the wrappers use fixed size local * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the * majority of bitmaps used by ethtool. */ #define ETHNL_SMALL_BITMAP_BITS 128 #define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32) int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS]; u32 *bitmap32 = small_bitmap32; bool u32_mod = false; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int dst_words = DIV_ROUND_UP(nbits, 32); bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL); if (!bitmap32) return -ENOMEM; } bitmap_to_arr32(bitmap32, bitmap, nbits); ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack, &u32_mod); if (u32_mod) { bitmap_from_arr32(bitmap, bitmap32, nbits); *mod = true; } if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(bitmap32); return ret; } #else /* On little endian 64-bit and all 32-bit architectures, an unsigned long * based bitmap can be interpreted as u32 based one using a simple cast. */ int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_put_bitset32(skb, attrtype, (const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack, mod); } #endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */
2 19 163 330 330 323 323 36 54 54 50 7 36 21 21 53 21 94 88 8 59 29 36 58 55 6 52 52 64 64 64 7 22 8 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 /* SPDX-License-Identifier: GPL-1.0+ */ /* * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'. * * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes * NCM: Network and Communications Management, Inc. * * BUT, I'm the one who modified it for ethernet, so: * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov * */ #ifndef _NET_BONDING_H #define _NET_BONDING_H #include <linux/timer.h> #include <linux/proc_fs.h> #include <linux/if_bonding.h> #include <linux/cpumask.h> #include <linux/in6.h> #include <linux/netpoll.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/reciprocal_div.h> #include <linux/if_link.h> #include <net/bond_3ad.h> #include <net/bond_alb.h> #include <net/bond_options.h> #include <net/ipv6.h> #include <net/addrconf.h> #define BOND_MAX_ARP_TARGETS 16 #define BOND_MAX_NS_TARGETS BOND_MAX_ARP_TARGETS #define BOND_DEFAULT_MIIMON 100 #ifndef __long_aligned #define __long_aligned __attribute__((aligned((sizeof(long))))) #endif #define slave_info(bond_dev, slave_dev, fmt, ...) \ netdev_info(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_warn(bond_dev, slave_dev, fmt, ...) \ netdev_warn(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_dbg(bond_dev, slave_dev, fmt, ...) \ netdev_dbg(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_err(bond_dev, slave_dev, fmt, ...) \ netdev_err(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define BOND_MODE(bond) ((bond)->params.mode) /* slave list primitives */ #define bond_slave_list(bond) (&(bond)->dev->adj_list.lower) #define bond_has_slaves(bond) !list_empty(bond_slave_list(bond)) /* IMPORTANT: bond_first/last_slave can return NULL in case of an empty list */ #define bond_first_slave(bond) \ (bond_has_slaves(bond) ? \ netdev_adjacent_get_private(bond_slave_list(bond)->next) : \ NULL) #define bond_last_slave(bond) \ (bond_has_slaves(bond) ? \ netdev_adjacent_get_private(bond_slave_list(bond)->prev) : \ NULL) /* Caller must have rcu_read_lock */ #define bond_first_slave_rcu(bond) \ netdev_lower_get_first_private_rcu(bond->dev) #define bond_is_first_slave(bond, pos) (pos == bond_first_slave(bond)) #define bond_is_last_slave(bond, pos) (pos == bond_last_slave(bond)) /** * bond_for_each_slave - iterate over all slaves * @bond: the bond holding this list * @pos: current slave * @iter: list_head * iterator * * Caller must hold RTNL */ #define bond_for_each_slave(bond, pos, iter) \ netdev_for_each_lower_private((bond)->dev, pos, iter) /* Caller must have rcu_read_lock */ #define bond_for_each_slave_rcu(bond, pos, iter) \ netdev_for_each_lower_private_rcu((bond)->dev, pos, iter) #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) #ifdef CONFIG_NET_POLL_CONTROLLER extern atomic_t netpoll_block_tx; static inline void block_netpoll_tx(void) { atomic_inc(&netpoll_block_tx); } static inline void unblock_netpoll_tx(void) { atomic_dec(&netpoll_block_tx); } static inline int is_netpoll_tx_blocked(struct net_device *dev) { if (unlikely(netpoll_tx_running(dev))) return atomic_read(&netpoll_block_tx); return 0; } #else #define block_netpoll_tx() #define unblock_netpoll_tx() #define is_netpoll_tx_blocked(dev) (0) #endif struct bond_params { int mode; int xmit_policy; int miimon; u8 num_peer_notif; u8 missed_max; int arp_interval; int arp_validate; int arp_all_targets; int use_carrier; int fail_over_mac; int updelay; int downdelay; int peer_notif_delay; int lacp_active; int lacp_fast; unsigned int min_links; int ad_select; char primary[IFNAMSIZ]; int primary_reselect; __be32 arp_targets[BOND_MAX_ARP_TARGETS]; int tx_queues; int all_slaves_active; int resend_igmp; int lp_interval; int packets_per_slave; int tlb_dynamic_lb; struct reciprocal_value reciprocal_packets_per_slave; u16 ad_actor_sys_prio; u16 ad_user_port_key; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; #endif int coupled_control; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; }; struct slave { struct net_device *dev; /* first - useful for panic debug */ struct bonding *bond; /* our master */ int delay; /* all 4 in jiffies */ unsigned long last_link_up; unsigned long last_tx; unsigned long last_rx; unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; s8 link; /* one of BOND_LINK_XXXX */ s8 link_new_state; /* one of BOND_LINK_XXXX */ u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ inactive:1, /* indicates inactive slave */ rx_disabled:1, /* indicates whether slave's Rx is disabled */ should_notify:1, /* indicates whether the state changed */ should_notify_link:1; /* indicates whether the link changed */ u8 duplex; u32 original_mtu; u32 link_failure_count; u32 speed; u16 queue_id; u8 perm_hwaddr[MAX_ADDR_LEN]; int prio; struct ad_slave_info *ad_info; struct tlb_slave_info tlb_info; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif struct delayed_work notify_work; struct kobject kobj; struct rtnl_link_stats64 slave_stats; }; static inline struct slave *to_slave(struct kobject *kobj) { return container_of(kobj, struct slave, kobj); } struct bond_up_slave { unsigned int count; struct rcu_head rcu; struct slave *arr[]; }; /* * Link pseudo-state only used internally by monitors */ #define BOND_LINK_NOCHANGE -1 struct bond_ipsec { struct list_head list; struct xfrm_state *xs; }; /* * Here are the locking policies for the two bonding locks: * Get rcu_read_lock when reading or RTNL when writing slave list. */ struct bonding { struct net_device *dev; /* first - useful for panic debug */ struct slave __rcu *curr_active_slave; struct slave __rcu *current_arp_slave; struct slave __rcu *primary_slave; struct bond_up_slave __rcu *usable_slaves; struct bond_up_slave __rcu *all_slaves; bool force_primary; bool notifier_ctx; s32 slave_cnt; /* never change this value outside the attach/detach wrappers */ int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); /* mode_lock is used for mode-specific locking needs, currently used by: * 3ad mode (4) - protect against running bond_3ad_unbind_slave() and * bond_3ad_state_machine_handler() concurrently and also * the access to the state machine shared variables. * TLB mode (5) - to sync the use and modifications of its hash table * ALB mode (6) - to sync the use and modifications of its hash table */ spinlock_t mode_lock; spinlock_t stats_lock; u32 send_peer_notif; u8 igmp_retrans; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_entry; char proc_file_name[IFNAMSIZ]; #endif /* CONFIG_PROC_FS */ struct list_head bond_list; u32 __percpu *rr_tx_counter; struct ad_bond_info ad_info; struct alb_bond_info alb_info; struct bond_params params; struct workqueue_struct *wq; struct delayed_work mii_work; struct delayed_work arp_work; struct delayed_work alb_work; struct delayed_work ad_work; struct delayed_work mcast_work; struct delayed_work slave_arr_work; #ifdef CONFIG_DEBUG_FS /* debugging support via debugfs */ struct dentry *debug_dir; #endif /* CONFIG_DEBUG_FS */ struct rtnl_link_stats64 bond_stats; #ifdef CONFIG_XFRM_OFFLOAD struct list_head ipsec_list; /* protecting ipsec_list */ struct mutex ipsec_lock; #endif /* CONFIG_XFRM_OFFLOAD */ struct bpf_prog *xdp_prog; }; #define bond_slave_get_rcu(dev) \ ((struct slave *) rcu_dereference(dev->rx_handler_data)) #define bond_slave_get_rtnl(dev) \ ((struct slave *) rtnl_dereference(dev->rx_handler_data)) void bond_queue_slave_event(struct slave *slave); void bond_lower_state_changed(struct slave *slave); struct bond_vlan_tag { __be16 vlan_proto; unsigned short vlan_id; }; /* * Returns NULL if the net_device does not belong to any of the bond's slaves * * Caller must hold bond lock for read */ static inline struct slave *bond_get_slave_by_dev(struct bonding *bond, struct net_device *slave_dev) { return netdev_lower_dev_get_private(bond->dev, slave_dev); } static inline struct bonding *bond_get_bond_by_slave(struct slave *slave) { return slave->bond; } static inline bool bond_should_override_tx_queue(struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP || BOND_MODE(bond) == BOND_MODE_ROUNDROBIN; } static inline bool bond_is_lb(const struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB; } static inline bool bond_needs_speed_duplex(const struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_8023AD || bond_is_lb(bond); } static inline bool bond_is_nondyn_tlb(const struct bonding *bond) { return (bond_is_lb(bond) && bond->params.tlb_dynamic_lb == 0); } static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_8023AD || BOND_MODE(bond) == BOND_MODE_XOR || BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB); } static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_8023AD || BOND_MODE(bond) == BOND_MODE_XOR || bond_is_nondyn_tlb(bond)); } static inline bool bond_mode_uses_arp(int mode) { return mode != BOND_MODE_8023AD && mode != BOND_MODE_TLB && mode != BOND_MODE_ALB; } static inline bool bond_mode_uses_primary(int mode) { return mode == BOND_MODE_ACTIVEBACKUP || mode == BOND_MODE_TLB || mode == BOND_MODE_ALB; } static inline bool bond_uses_primary(struct bonding *bond) { return bond_mode_uses_primary(BOND_MODE(bond)); } static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond) { struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave); return bond_uses_primary(bond) && slave ? slave->dev : NULL; } static inline bool bond_slave_is_up(struct slave *slave) { return netif_running(slave->dev) && netif_carrier_ok(slave->dev); } static inline void bond_set_active_slave(struct slave *slave) { if (slave->backup) { slave->backup = 0; bond_queue_slave_event(slave); bond_lower_state_changed(slave); } } static inline void bond_set_backup_slave(struct slave *slave) { if (!slave->backup) { slave->backup = 1; bond_queue_slave_event(slave); bond_lower_state_changed(slave); } } static inline void bond_set_slave_state(struct slave *slave, int slave_state, bool notify) { if (slave->backup == slave_state) return; slave->backup = slave_state; if (notify) { bond_lower_state_changed(slave); bond_queue_slave_event(slave); slave->should_notify = 0; } else { if (slave->should_notify) slave->should_notify = 0; else slave->should_notify = 1; } } static inline void bond_slave_state_change(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->link == BOND_LINK_UP) bond_set_active_slave(tmp); else if (tmp->link == BOND_LINK_DOWN) bond_set_backup_slave(tmp); } } static inline void bond_slave_state_notify(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify) { bond_lower_state_changed(tmp); tmp->should_notify = 0; } } } static inline int bond_slave_state(struct slave *slave) { return slave->backup; } static inline bool bond_is_active_slave(struct slave *slave) { return !bond_slave_state(slave); } static inline bool bond_slave_can_tx(struct slave *slave) { return bond_slave_is_up(slave) && slave->link == BOND_LINK_UP && bond_is_active_slave(slave); } static inline bool bond_is_active_slave_dev(const struct net_device *slave_dev) { struct slave *slave; bool active; rcu_read_lock(); slave = bond_slave_get_rcu(slave_dev); active = bond_is_active_slave(slave); rcu_read_unlock(); return active; } static inline void bond_hw_addr_copy(u8 *dst, const u8 *src, unsigned int len) { if (len == ETH_ALEN) { ether_addr_copy(dst, src); return; } memcpy(dst, src, len); } #define BOND_PRI_RESELECT_ALWAYS 0 #define BOND_PRI_RESELECT_BETTER 1 #define BOND_PRI_RESELECT_FAILURE 2 #define BOND_FOM_NONE 0 #define BOND_FOM_ACTIVE 1 #define BOND_FOM_FOLLOW 2 #define BOND_ARP_TARGETS_ANY 0 #define BOND_ARP_TARGETS_ALL 1 #define BOND_ARP_VALIDATE_NONE 0 #define BOND_ARP_VALIDATE_ACTIVE (1 << BOND_STATE_ACTIVE) #define BOND_ARP_VALIDATE_BACKUP (1 << BOND_STATE_BACKUP) #define BOND_ARP_VALIDATE_ALL (BOND_ARP_VALIDATE_ACTIVE | \ BOND_ARP_VALIDATE_BACKUP) #define BOND_ARP_FILTER (BOND_ARP_VALIDATE_ALL + 1) #define BOND_ARP_FILTER_ACTIVE (BOND_ARP_VALIDATE_ACTIVE | \ BOND_ARP_FILTER) #define BOND_ARP_FILTER_BACKUP (BOND_ARP_VALIDATE_BACKUP | \ BOND_ARP_FILTER) #define BOND_SLAVE_NOTIFY_NOW true #define BOND_SLAVE_NOTIFY_LATER false static inline int slave_do_arp_validate(struct bonding *bond, struct slave *slave) { return bond->params.arp_validate & (1 << bond_slave_state(slave)); } static inline int slave_do_arp_validate_only(struct bonding *bond) { return bond->params.arp_validate & BOND_ARP_FILTER; } static inline int bond_is_ip_target_ok(__be32 addr) { return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr); } #if IS_ENABLED(CONFIG_IPV6) static inline int bond_is_ip6_target_ok(struct in6_addr *addr) { return !ipv6_addr_any(addr) && !ipv6_addr_loopback(addr) && !ipv6_addr_is_multicast(addr); } #endif /* Get the oldest arp which we've received on this slave for bond's * arp_targets. */ static inline unsigned long slave_oldest_target_arp_rx(struct bonding *bond, struct slave *slave) { int i = 1; unsigned long ret = slave->target_last_arp_rx[0]; for (; (i < BOND_MAX_ARP_TARGETS) && bond->params.arp_targets[i]; i++) if (time_before(slave->target_last_arp_rx[i], ret)) ret = slave->target_last_arp_rx[i]; return ret; } static inline unsigned long slave_last_rx(struct bonding *bond, struct slave *slave) { if (bond->params.arp_all_targets == BOND_ARP_TARGETS_ALL) return slave_oldest_target_arp_rx(bond, slave); return slave->last_rx; } static inline void slave_update_last_tx(struct slave *slave) { WRITE_ONCE(slave->last_tx, jiffies); } static inline unsigned long slave_last_tx(struct slave *slave) { return READ_ONCE(slave->last_tx); } #ifdef CONFIG_NET_POLL_CONTROLLER static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { return netpoll_send_skb(slave->np, skb); } #else static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { BUG(); return NETDEV_TX_OK; } #endif static inline void bond_set_slave_inactive_flags(struct slave *slave, bool notify) { if (!bond_is_lb(slave->bond)) bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); if (!slave->bond->params.all_slaves_active) slave->inactive = 1; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) slave->rx_disabled = 1; } static inline void bond_set_slave_tx_disabled_flags(struct slave *slave, bool notify) { bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); } static inline void bond_set_slave_active_flags(struct slave *slave, bool notify) { bond_set_slave_state(slave, BOND_STATE_ACTIVE, notify); slave->inactive = 0; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) slave->rx_disabled = 0; } static inline void bond_set_slave_rx_enabled_flags(struct slave *slave, bool notify) { slave->rx_disabled = 0; } static inline bool bond_is_slave_inactive(struct slave *slave) { return slave->inactive; } static inline bool bond_is_slave_rx_disabled(struct slave *slave) { return slave->rx_disabled; } static inline void bond_propose_link_state(struct slave *slave, int state) { slave->link_new_state = state; } static inline void bond_commit_link_state(struct slave *slave, bool notify) { if (slave->link_new_state == BOND_LINK_NOCHANGE) return; slave->link = slave->link_new_state; if (notify) { bond_queue_slave_event(slave); bond_lower_state_changed(slave); slave->should_notify_link = 0; } else { if (slave->should_notify_link) slave->should_notify_link = 0; else slave->should_notify_link = 1; } } static inline void bond_set_slave_link_state(struct slave *slave, int state, bool notify) { bond_propose_link_state(slave, state); bond_commit_link_state(slave, notify); } static inline void bond_slave_link_notify(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify_link) { bond_queue_slave_event(tmp); bond_lower_state_changed(tmp); tmp->should_notify_link = 0; } } } static inline __be32 bond_confirm_addr(struct net_device *dev, __be32 dst, __be32 local) { struct in_device *in_dev; __be32 addr = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) addr = inet_confirm_addr(dev_net(dev), in_dev, dst, local, RT_SCOPE_HOST); rcu_read_unlock(); return addr; } struct bond_net { struct net *net; /* Associated network namespace */ struct list_head dev_list; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_dir; #endif struct class_attribute class_attr_bonding_masters; }; int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev); int bond_create(struct net *net, const char *name); int bond_create_sysfs(struct bond_net *net); void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void bond_sysfs_slave_del(struct slave *slave); void bond_xdp_set_features(struct net_device *bond_dev); int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct netlink_ext_ack *extack); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb); int bond_set_carrier(struct bonding *bond); void bond_select_active_slave(struct bonding *bond); void bond_change_active_slave(struct bonding *bond, struct slave *new_active); void bond_create_debugfs(void); void bond_destroy_debugfs(void); void bond_debug_register(struct bonding *bond); void bond_debug_unregister(struct bonding *bond); void bond_debug_reregister(struct bonding *bond); const char *bond_mode_name(int mode); bool bond_xdp_check(struct bonding *bond, int mode); void bond_setup(struct net_device *bond_dev); unsigned int bond_get_num_tx_queues(void); int bond_netlink_init(void); void bond_netlink_fini(void); struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond); const char *bond_slave_link_status(s8 link); struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, struct net_device *end_dev, int level); int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave); void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay); void bond_work_init_all(struct bonding *bond); #ifdef CONFIG_PROC_FS void bond_create_proc_entry(struct bonding *bond); void bond_remove_proc_entry(struct bonding *bond); void bond_create_proc_dir(struct bond_net *bn); void bond_destroy_proc_dir(struct bond_net *bn); #else static inline void bond_create_proc_entry(struct bonding *bond) { } static inline void bond_remove_proc_entry(struct bonding *bond) { } static inline void bond_create_proc_dir(struct bond_net *bn) { } static inline void bond_destroy_proc_dir(struct bond_net *bn) { } #endif static inline struct slave *bond_slave_has_mac(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return tmp; return NULL; } /* Caller must hold rcu_read_lock() for read */ static inline bool bond_slave_has_mac_rcu(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; bond_for_each_slave_rcu(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return true; return false; } /* Check if the ip is present in arp ip list, or first free slot if ip == 0 * Returns -1 if not found, index if found */ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) { int i; for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) if (targets[i] == ip) return i; else if (targets[i] == 0) break; return -1; } #if IS_ENABLED(CONFIG_IPV6) static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) { struct in6_addr mcaddr; int i; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { addrconf_addr_solict_mult(&targets[i], &mcaddr); if ((ipv6_addr_equal(&targets[i], ip)) || (ipv6_addr_equal(&mcaddr, ip))) return i; else if (ipv6_addr_any(&targets[i])) break; } return -1; } #endif /* exported from bond_main.c */ extern unsigned int bond_net_id; /* exported from bond_netlink.c */ extern struct rtnl_link_ops bond_link_ops; /* exported from bond_sysfs_slave.c */ extern const struct sysfs_ops slave_sysfs_ops; /* exported from bond_3ad.c */ extern const u8 lacpdu_mcast_addr[]; static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) { dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); return NET_XMIT_DROP; } #endif /* _NET_BONDING_H */
4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * i2c.h - definitions for the Linux i2c bus interface * Copyright (C) 1995-2000 Simon G. Vogl * Copyright (C) 2013-2019 Wolfram Sang <wsa@kernel.org> * * With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and * Frodo Looijaard <frodol@dds.nl> */ #ifndef _LINUX_I2C_H #define _LINUX_I2C_H #include <linux/acpi.h> /* for acpi_handle */ #include <linux/bits.h> #include <linux/mod_devicetable.h> #include <linux/device.h> /* for struct device */ #include <linux/sched.h> /* for completion */ #include <linux/mutex.h> #include <linux/regulator/consumer.h> #include <linux/rtmutex.h> #include <linux/irqdomain.h> /* for Host Notify IRQ */ #include <linux/of.h> /* for struct device_node */ #include <linux/swab.h> /* for swab16 */ #include <uapi/linux/i2c.h> extern const struct bus_type i2c_bus_type; extern const struct device_type i2c_adapter_type; extern const struct device_type i2c_client_type; /* --- General options ------------------------------------------------ */ struct i2c_msg; struct i2c_adapter; struct i2c_client; struct i2c_driver; struct i2c_device_identity; union i2c_smbus_data; struct i2c_board_info; enum i2c_slave_event; typedef int (*i2c_slave_cb_t)(struct i2c_client *client, enum i2c_slave_event event, u8 *val); /* I2C Frequency Modes */ #define I2C_MAX_STANDARD_MODE_FREQ 100000 #define I2C_MAX_FAST_MODE_FREQ 400000 #define I2C_MAX_FAST_MODE_PLUS_FREQ 1000000 #define I2C_MAX_TURBO_MODE_FREQ 1400000 #define I2C_MAX_HIGH_SPEED_MODE_FREQ 3400000 #define I2C_MAX_ULTRA_FAST_MODE_FREQ 5000000 struct module; struct property_entry; #if IS_ENABLED(CONFIG_I2C) /* Return the Frequency mode string based on the bus frequency */ const char *i2c_freq_mode_string(u32 bus_freq_hz); /* * The master routines are the ones normally used to transmit data to devices * on a bus (or read from them). Apart from two basic transfer functions to * transmit one message at a time, a more complex version can be used to * transmit an arbitrary number of messages without interruption. * @count must be less than 64k since msg.len is u16. */ int i2c_transfer_buffer_flags(const struct i2c_client *client, char *buf, int count, u16 flags); /** * i2c_master_recv - issue a single I2C message in master receive mode * @client: Handle to slave device * @buf: Where to store data read from slave * @count: How many bytes to read, must be less than 64k since msg.len is u16 * * Returns negative errno, or else the number of bytes read. */ static inline int i2c_master_recv(const struct i2c_client *client, char *buf, int count) { return i2c_transfer_buffer_flags(client, buf, count, I2C_M_RD); }; /** * i2c_master_recv_dmasafe - issue a single I2C message in master receive mode * using a DMA safe buffer * @client: Handle to slave device * @buf: Where to store data read from slave, must be safe to use with DMA * @count: How many bytes to read, must be less than 64k since msg.len is u16 * * Returns negative errno, or else the number of bytes read. */ static inline int i2c_master_recv_dmasafe(const struct i2c_client *client, char *buf, int count) { return i2c_transfer_buffer_flags(client, buf, count, I2C_M_RD | I2C_M_DMA_SAFE); }; /** * i2c_master_send - issue a single I2C message in master transmit mode * @client: Handle to slave device * @buf: Data that will be written to the slave * @count: How many bytes to write, must be less than 64k since msg.len is u16 * * Returns negative errno, or else the number of bytes written. */ static inline int i2c_master_send(const struct i2c_client *client, const char *buf, int count) { return i2c_transfer_buffer_flags(client, (char *)buf, count, 0); }; /** * i2c_master_send_dmasafe - issue a single I2C message in master transmit mode * using a DMA safe buffer * @client: Handle to slave device * @buf: Data that will be written to the slave, must be safe to use with DMA * @count: How many bytes to write, must be less than 64k since msg.len is u16 * * Returns negative errno, or else the number of bytes written. */ static inline int i2c_master_send_dmasafe(const struct i2c_client *client, const char *buf, int count) { return i2c_transfer_buffer_flags(client, (char *)buf, count, I2C_M_DMA_SAFE); }; /* Transfer num messages. */ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); /* Unlocked flavor */ int __i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); /* This is the very generalized SMBus access routine. You probably do not want to use this, though; one of the functions below may be much easier, and probably just as fast. Note that we use i2c_adapter here, because you do not need a specific smbus adapter to call this function. */ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags, char read_write, u8 command, int protocol, union i2c_smbus_data *data); /* Unlocked flavor */ s32 __i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags, char read_write, u8 command, int protocol, union i2c_smbus_data *data); /* Now follow the 'nice' access routines. These also document the calling conventions of i2c_smbus_xfer. */ u8 i2c_smbus_pec(u8 crc, u8 *p, size_t count); s32 i2c_smbus_read_byte(const struct i2c_client *client); s32 i2c_smbus_write_byte(const struct i2c_client *client, u8 value); s32 i2c_smbus_read_byte_data(const struct i2c_client *client, u8 command); s32 i2c_smbus_write_byte_data(const struct i2c_client *client, u8 command, u8 value); s32 i2c_smbus_read_word_data(const struct i2c_client *client, u8 command); s32 i2c_smbus_write_word_data(const struct i2c_client *client, u8 command, u16 value); static inline s32 i2c_smbus_read_word_swapped(const struct i2c_client *client, u8 command) { s32 value = i2c_smbus_read_word_data(client, command); return (value < 0) ? value : swab16(value); } static inline s32 i2c_smbus_write_word_swapped(const struct i2c_client *client, u8 command, u16 value) { return i2c_smbus_write_word_data(client, command, swab16(value)); } /* Returns the number of read bytes */ s32 i2c_smbus_read_block_data(const struct i2c_client *client, u8 command, u8 *values); s32 i2c_smbus_write_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values); /* Returns the number of read bytes */ s32 i2c_smbus_read_i2c_block_data(const struct i2c_client *client, u8 command, u8 length, u8 *values); s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values); s32 i2c_smbus_read_i2c_block_data_or_emulated(const struct i2c_client *client, u8 command, u8 length, u8 *values); int i2c_get_device_id(const struct i2c_client *client, struct i2c_device_identity *id); const struct i2c_device_id *i2c_client_get_device_id(const struct i2c_client *client); #endif /* I2C */ /** * struct i2c_device_identity - i2c client device identification * @manufacturer_id: 0 - 4095, database maintained by NXP * @part_id: 0 - 511, according to manufacturer * @die_revision: 0 - 7, according to manufacturer */ struct i2c_device_identity { u16 manufacturer_id; #define I2C_DEVICE_ID_NXP_SEMICONDUCTORS 0 #define I2C_DEVICE_ID_NXP_SEMICONDUCTORS_1 1 #define I2C_DEVICE_ID_NXP_SEMICONDUCTORS_2 2 #define I2C_DEVICE_ID_NXP_SEMICONDUCTORS_3 3 #define I2C_DEVICE_ID_RAMTRON_INTERNATIONAL 4 #define I2C_DEVICE_ID_ANALOG_DEVICES 5 #define I2C_DEVICE_ID_STMICROELECTRONICS 6 #define I2C_DEVICE_ID_ON_SEMICONDUCTOR 7 #define I2C_DEVICE_ID_SPRINTEK_CORPORATION 8 #define I2C_DEVICE_ID_ESPROS_PHOTONICS_AG 9 #define I2C_DEVICE_ID_FUJITSU_SEMICONDUCTOR 10 #define I2C_DEVICE_ID_FLIR 11 #define I2C_DEVICE_ID_O2MICRO 12 #define I2C_DEVICE_ID_ATMEL 13 #define I2C_DEVICE_ID_NONE 0xffff u16 part_id; u8 die_revision; }; enum i2c_alert_protocol { I2C_PROTOCOL_SMBUS_ALERT, I2C_PROTOCOL_SMBUS_HOST_NOTIFY, }; /** * enum i2c_driver_flags - Flags for an I2C device driver * * @I2C_DRV_ACPI_WAIVE_D0_PROBE: Don't put the device in D0 state for probe */ enum i2c_driver_flags { I2C_DRV_ACPI_WAIVE_D0_PROBE = BIT(0), }; /** * struct i2c_driver - represent an I2C device driver * @class: What kind of i2c device we instantiate (for detect) * @probe: Callback for device binding * @remove: Callback for device unbinding * @shutdown: Callback for device shutdown * @alert: Alert callback, for example for the SMBus alert protocol * @command: Callback for bus-wide signaling (optional) * @driver: Device driver model driver * @id_table: List of I2C devices supported by this driver * @detect: Callback for device detection * @address_list: The I2C addresses to probe (for detect) * @clients: List of detected clients we created (for i2c-core use only) * @flags: A bitmask of flags defined in &enum i2c_driver_flags * * The driver.owner field should be set to the module owner of this driver. * The driver.name field should be set to the name of this driver. * * For automatic device detection, both @detect and @address_list must * be defined. @class should also be set, otherwise only devices forced * with module parameters will be created. The detect function must * fill at least the name field of the i2c_board_info structure it is * handed upon successful detection, and possibly also the flags field. * * If @detect is missing, the driver will still work fine for enumerated * devices. Detected devices simply won't be supported. This is expected * for the many I2C/SMBus devices which can't be detected reliably, and * the ones which can always be enumerated in practice. * * The i2c_client structure which is handed to the @detect callback is * not a real i2c_client. It is initialized just enough so that you can * call i2c_smbus_read_byte_data and friends on it. Don't do anything * else with it. In particular, calling dev_dbg and friends on it is * not allowed. */ struct i2c_driver { unsigned int class; /* Standard driver model interfaces */ int (*probe)(struct i2c_client *client); void (*remove)(struct i2c_client *client); /* driver model interfaces that don't relate to enumeration */ void (*shutdown)(struct i2c_client *client); /* Alert callback, for example for the SMBus alert protocol. * The format and meaning of the data value depends on the protocol. * For the SMBus alert protocol, there is a single bit of data passed * as the alert response's low bit ("event flag"). * For the SMBus Host Notify protocol, the data corresponds to the * 16-bit payload data reported by the slave device acting as master. */ void (*alert)(struct i2c_client *client, enum i2c_alert_protocol protocol, unsigned int data); /* a ioctl like command that can be used to perform specific functions * with the device. */ int (*command)(struct i2c_client *client, unsigned int cmd, void *arg); struct device_driver driver; const struct i2c_device_id *id_table; /* Device detection callback for automatic device creation */ int (*detect)(struct i2c_client *client, struct i2c_board_info *info); const unsigned short *address_list; struct list_head clients; u32 flags; }; #define to_i2c_driver(d) container_of_const(d, struct i2c_driver, driver) /** * struct i2c_client - represent an I2C slave device * @flags: see I2C_CLIENT_* for possible flags * @addr: Address used on the I2C bus connected to the parent adapter. * @name: Indicates the type of the device, usually a chip name that's * generic enough to hide second-sourcing and compatible revisions. * @adapter: manages the bus segment hosting this I2C device * @dev: Driver model device node for the slave. * @init_irq: IRQ that was set at initialization * @irq: indicates the IRQ generated by this device (if any) * @detected: member of an i2c_driver.clients list or i2c-core's * userspace_devices list * @slave_cb: Callback when I2C slave mode of an adapter is used. The adapter * calls it to pass on slave events to the slave driver. * @devres_group_id: id of the devres group that will be created for resources * acquired when probing this device. * @debugfs: pointer to the debugfs subdirectory which the I2C core created * for this client. * * An i2c_client identifies a single device (i.e. chip) connected to an * i2c bus. The behaviour exposed to Linux is defined by the driver * managing the device. */ struct i2c_client { unsigned short flags; /* div., see below */ #define I2C_CLIENT_PEC 0x04 /* Use Packet Error Checking */ #define I2C_CLIENT_TEN 0x10 /* we have a ten bit chip address */ /* Must equal I2C_M_TEN below */ #define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ unsigned short addr; /* chip address - NOTE: 7bit */ /* addresses are stored in the */ /* _LOWER_ 7 bits */ char name[I2C_NAME_SIZE]; struct i2c_adapter *adapter; /* the adapter we sit on */ struct device dev; /* the device structure */ int init_irq; /* irq set at initialization */ int irq; /* irq issued by device */ struct list_head detected; #if IS_ENABLED(CONFIG_I2C_SLAVE) i2c_slave_cb_t slave_cb; /* callback for slave mode */ #endif void *devres_group_id; /* ID of probe devres group */ struct dentry *debugfs; /* per-client debugfs dir */ }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) struct i2c_adapter *i2c_verify_adapter(struct device *dev); const struct i2c_device_id *i2c_match_id(const struct i2c_device_id *id, const struct i2c_client *client); const void *i2c_get_match_data(const struct i2c_client *client); static inline struct i2c_client *kobj_to_i2c_client(struct kobject *kobj) { struct device * const dev = kobj_to_dev(kobj); return to_i2c_client(dev); } static inline void *i2c_get_clientdata(const struct i2c_client *client) { return dev_get_drvdata(&client->dev); } static inline void i2c_set_clientdata(struct i2c_client *client, void *data) { dev_set_drvdata(&client->dev, data); } /* I2C slave support */ enum i2c_slave_event { I2C_SLAVE_READ_REQUESTED, I2C_SLAVE_WRITE_REQUESTED, I2C_SLAVE_READ_PROCESSED, I2C_SLAVE_WRITE_RECEIVED, I2C_SLAVE_STOP, }; int i2c_slave_register(struct i2c_client *client, i2c_slave_cb_t slave_cb); int i2c_slave_unregister(struct i2c_client *client); int i2c_slave_event(struct i2c_client *client, enum i2c_slave_event event, u8 *val); #if IS_ENABLED(CONFIG_I2C_SLAVE) bool i2c_detect_slave_mode(struct device *dev); #else static inline bool i2c_detect_slave_mode(struct device *dev) { return false; } #endif /** * struct i2c_board_info - template for device creation * @type: chip type, to initialize i2c_client.name * @flags: to initialize i2c_client.flags * @addr: stored in i2c_client.addr * @dev_name: Overrides the default <busnr>-<addr> dev_name if set * @platform_data: stored in i2c_client.dev.platform_data * @fwnode: device node supplied by the platform firmware * @swnode: software node for the device * @resources: resources associated with the device * @num_resources: number of resources in the @resources array * @irq: stored in i2c_client.irq * * I2C doesn't actually support hardware probing, although controllers and * devices may be able to use I2C_SMBUS_QUICK to tell whether or not there's * a device at a given address. Drivers commonly need more information than * that, such as chip type, configuration, associated IRQ, and so on. * * i2c_board_info is used to build tables of information listing I2C devices * that are present. This information is used to grow the driver model tree. * For mainboards this is done statically using i2c_register_board_info(); * bus numbers identify adapters that aren't yet available. For add-on boards, * i2c_new_client_device() does this dynamically with the adapter already known. */ struct i2c_board_info { char type[I2C_NAME_SIZE]; unsigned short flags; unsigned short addr; const char *dev_name; void *platform_data; struct fwnode_handle *fwnode; const struct software_node *swnode; const struct resource *resources; unsigned int num_resources; int irq; }; /** * I2C_BOARD_INFO - macro used to list an i2c device and its address * @dev_type: identifies the device type * @dev_addr: the device's address on the bus. * * This macro initializes essential fields of a struct i2c_board_info, * declaring what has been provided on a particular board. Optional * fields (such as associated irq, or device-specific platform_data) * are provided using conventional syntax. */ #define I2C_BOARD_INFO(dev_type, dev_addr) \ .type = dev_type, .addr = (dev_addr) #if IS_ENABLED(CONFIG_I2C) /* * Add-on boards should register/unregister their devices; e.g. a board * with integrated I2C, a config eeprom, sensors, and a codec that's * used in conjunction with the primary hardware. */ struct i2c_client * i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info); /* If you don't know the exact address of an I2C device, use this variant * instead, which can probe for device presence in a list of possible * addresses. The "probe" callback function is optional. If it is provided, * it must return 1 on successful probe, 0 otherwise. If it is not provided, * a default probing method is used. */ struct i2c_client * i2c_new_scanned_device(struct i2c_adapter *adap, struct i2c_board_info *info, unsigned short const *addr_list, int (*probe)(struct i2c_adapter *adap, unsigned short addr)); /* Common custom probe functions */ int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short addr); struct i2c_client * i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address); struct i2c_client * devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adap, u16 address); struct i2c_client * i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr); void i2c_unregister_device(struct i2c_client *client); struct i2c_client *i2c_verify_client(struct device *dev); #else static inline struct i2c_client *i2c_verify_client(struct device *dev) { return NULL; } #endif /* I2C */ /* Mainboard arch_initcall() code should register all its I2C devices. * This is done at arch_initcall time, before declaring any i2c adapters. * Modules for add-on boards must use other calls. */ #ifdef CONFIG_I2C_BOARDINFO int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n); #else static inline int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n) { return 0; } #endif /* I2C_BOARDINFO */ /** * struct i2c_algorithm - represent I2C transfer methods * @xfer: Transfer a given number of messages defined by the msgs array via * the specified adapter. * @xfer_atomic: Same as @xfer. Yet, only using atomic context so e.g. PMICs * can be accessed very late before shutdown. Optional. * @smbus_xfer: Issue SMBus transactions to the given I2C adapter. If this * is not present, then the bus layer will try and convert the SMBus calls * into I2C transfers instead. * @smbus_xfer_atomic: Same as @smbus_xfer. Yet, only using atomic context * so e.g. PMICs can be accessed very late before shutdown. Optional. * @functionality: Return the flags that this algorithm/adapter pair supports * from the ``I2C_FUNC_*`` flags. * @reg_target: Register given client to local target mode of this adapter * @unreg_target: Unregister given client from local target mode of this adapter * * @master_xfer: deprecated, use @xfer * @master_xfer_atomic: deprecated, use @xfer_atomic * @reg_slave: deprecated, use @reg_target * @unreg_slave: deprecated, use @unreg_target * * i2c_algorithm is the interface to a class of hardware solutions which can * be addressed using the same bus algorithms - i.e. bit-banging or the PCF8584 * to name two of the most common. * * The return codes from the ``xfer{_atomic}`` fields should indicate the * type of error code that occurred during the transfer, as documented in the * Kernel Documentation file Documentation/i2c/fault-codes.rst. Otherwise, the * number of messages executed should be returned. */ struct i2c_algorithm { /* * If an adapter algorithm can't do I2C-level access, set xfer * to NULL. If an adapter algorithm can do SMBus access, set * smbus_xfer. If set to NULL, the SMBus protocol is simulated * using common I2C messages. */ union { int (*xfer)(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); int (*master_xfer)(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); }; union { int (*xfer_atomic)(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); int (*master_xfer_atomic)(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); }; int (*smbus_xfer)(struct i2c_adapter *adap, u16 addr, unsigned short flags, char read_write, u8 command, int size, union i2c_smbus_data *data); int (*smbus_xfer_atomic)(struct i2c_adapter *adap, u16 addr, unsigned short flags, char read_write, u8 command, int size, union i2c_smbus_data *data); /* To determine what the adapter supports */ u32 (*functionality)(struct i2c_adapter *adap); #if IS_ENABLED(CONFIG_I2C_SLAVE) union { int (*reg_target)(struct i2c_client *client); int (*reg_slave)(struct i2c_client *client); }; union { int (*unreg_target)(struct i2c_client *client); int (*unreg_slave)(struct i2c_client *client); }; #endif }; /** * struct i2c_lock_operations - represent I2C locking operations * @lock_bus: Get exclusive access to an I2C bus segment * @trylock_bus: Try to get exclusive access to an I2C bus segment * @unlock_bus: Release exclusive access to an I2C bus segment * * The main operations are wrapped by i2c_lock_bus and i2c_unlock_bus. */ struct i2c_lock_operations { void (*lock_bus)(struct i2c_adapter *adapter, unsigned int flags); int (*trylock_bus)(struct i2c_adapter *adapter, unsigned int flags); void (*unlock_bus)(struct i2c_adapter *adapter, unsigned int flags); }; /** * struct i2c_timings - I2C timing information * @bus_freq_hz: the bus frequency in Hz * @scl_rise_ns: time SCL signal takes to rise in ns; t(r) in the I2C specification * @scl_fall_ns: time SCL signal takes to fall in ns; t(f) in the I2C specification * @scl_int_delay_ns: time IP core additionally needs to setup SCL in ns * @sda_fall_ns: time SDA signal takes to fall in ns; t(f) in the I2C specification * @sda_hold_ns: time IP core additionally needs to hold SDA in ns * @digital_filter_width_ns: width in ns of spikes on i2c lines that the IP core * digital filter can filter out * @analog_filter_cutoff_freq_hz: threshold frequency for the low pass IP core * analog filter */ struct i2c_timings { u32 bus_freq_hz; u32 scl_rise_ns; u32 scl_fall_ns; u32 scl_int_delay_ns; u32 sda_fall_ns; u32 sda_hold_ns; u32 digital_filter_width_ns; u32 analog_filter_cutoff_freq_hz; }; /** * struct i2c_bus_recovery_info - I2C bus recovery information * @recover_bus: Recover routine. Either pass driver's recover_bus() routine, or * i2c_generic_scl_recovery(). * @get_scl: This gets current value of SCL line. Mandatory for generic SCL * recovery. Populated internally for generic GPIO recovery. * @set_scl: This sets/clears the SCL line. Mandatory for generic SCL recovery. * Populated internally for generic GPIO recovery. * @get_sda: This gets current value of SDA line. This or set_sda() is mandatory * for generic SCL recovery. Populated internally, if sda_gpio is a valid * GPIO, for generic GPIO recovery. * @set_sda: This sets/clears the SDA line. This or get_sda() is mandatory for * generic SCL recovery. Populated internally, if sda_gpio is a valid GPIO, * for generic GPIO recovery. * @get_bus_free: Returns the bus free state as seen from the IP core in case it * has a more complex internal logic than just reading SDA. Optional. * @prepare_recovery: This will be called before starting recovery. Platform may * configure padmux here for SDA/SCL line or something else they want. * @unprepare_recovery: This will be called after completing recovery. Platform * may configure padmux here for SDA/SCL line or something else they want. * @scl_gpiod: gpiod of the SCL line. Only required for GPIO recovery. * @sda_gpiod: gpiod of the SDA line. Only required for GPIO recovery. * @pinctrl: pinctrl used by GPIO recovery to change the state of the I2C pins. * Optional. * @pins_default: default pinctrl state of SCL/SDA lines, when they are assigned * to the I2C bus. Optional. Populated internally for GPIO recovery, if * state with the name PINCTRL_STATE_DEFAULT is found and pinctrl is valid. * @pins_gpio: recovery pinctrl state of SCL/SDA lines, when they are used as * GPIOs. Optional. Populated internally for GPIO recovery, if this state * is called "gpio" or "recovery" and pinctrl is valid. */ struct i2c_bus_recovery_info { int (*recover_bus)(struct i2c_adapter *adap); int (*get_scl)(struct i2c_adapter *adap); void (*set_scl)(struct i2c_adapter *adap, int val); int (*get_sda)(struct i2c_adapter *adap); void (*set_sda)(struct i2c_adapter *adap, int val); int (*get_bus_free)(struct i2c_adapter *adap); void (*prepare_recovery)(struct i2c_adapter *adap); void (*unprepare_recovery)(struct i2c_adapter *adap); /* gpio recovery */ struct gpio_desc *scl_gpiod; struct gpio_desc *sda_gpiod; struct pinctrl *pinctrl; struct pinctrl_state *pins_default; struct pinctrl_state *pins_gpio; }; int i2c_recover_bus(struct i2c_adapter *adap); /* Generic recovery routines */ int i2c_generic_scl_recovery(struct i2c_adapter *adap); /** * struct i2c_adapter_quirks - describe flaws of an i2c adapter * @flags: see I2C_AQ_* for possible flags and read below * @max_num_msgs: maximum number of messages per transfer * @max_write_len: maximum length of a write message * @max_read_len: maximum length of a read message * @max_comb_1st_msg_len: maximum length of the first msg in a combined message * @max_comb_2nd_msg_len: maximum length of the second msg in a combined message * * Note about combined messages: Some I2C controllers can only send one message * per transfer, plus something called combined message or write-then-read. * This is (usually) a small write message followed by a read message and * barely enough to access register based devices like EEPROMs. There is a flag * to support this mode. It implies max_num_msg = 2 and does the length checks * with max_comb_*_len because combined message mode usually has its own * limitations. Because of HW implementations, some controllers can actually do * write-then-anything or other variants. To support that, write-then-read has * been broken out into smaller bits like write-first and read-second which can * be combined as needed. */ struct i2c_adapter_quirks { u64 flags; int max_num_msgs; u16 max_write_len; u16 max_read_len; u16 max_comb_1st_msg_len; u16 max_comb_2nd_msg_len; }; /* enforce max_num_msgs = 2 and use max_comb_*_len for length checks */ #define I2C_AQ_COMB BIT(0) /* first combined message must be write */ #define I2C_AQ_COMB_WRITE_FIRST BIT(1) /* second combined message must be read */ #define I2C_AQ_COMB_READ_SECOND BIT(2) /* both combined messages must have the same target address */ #define I2C_AQ_COMB_SAME_ADDR BIT(3) /* convenience macro for typical write-then read case */ #define I2C_AQ_COMB_WRITE_THEN_READ (I2C_AQ_COMB | I2C_AQ_COMB_WRITE_FIRST | \ I2C_AQ_COMB_READ_SECOND | I2C_AQ_COMB_SAME_ADDR) /* clock stretching is not supported */ #define I2C_AQ_NO_CLK_STRETCH BIT(4) /* message cannot have length of 0 */ #define I2C_AQ_NO_ZERO_LEN_READ BIT(5) #define I2C_AQ_NO_ZERO_LEN_WRITE BIT(6) #define I2C_AQ_NO_ZERO_LEN (I2C_AQ_NO_ZERO_LEN_READ | I2C_AQ_NO_ZERO_LEN_WRITE) /* adapter cannot do repeated START */ #define I2C_AQ_NO_REP_START BIT(7) /* * i2c_adapter is the structure used to identify a physical i2c bus along * with the access algorithms necessary to access it. */ struct i2c_adapter { struct module *owner; unsigned int class; /* classes to allow probing for */ const struct i2c_algorithm *algo; /* the algorithm to access the bus */ void *algo_data; /* data fields that are valid for all devices */ const struct i2c_lock_operations *lock_ops; struct rt_mutex bus_lock; struct rt_mutex mux_lock; int timeout; /* in jiffies */ int retries; struct device dev; /* the adapter device */ unsigned long locked_flags; /* owned by the I2C core */ #define I2C_ALF_IS_SUSPENDED 0 #define I2C_ALF_SUSPEND_REPORTED 1 int nr; char name[48]; struct completion dev_released; struct mutex userspace_clients_lock; struct list_head userspace_clients; struct i2c_bus_recovery_info *bus_recovery_info; const struct i2c_adapter_quirks *quirks; struct irq_domain *host_notify_domain; struct regulator *bus_regulator; struct dentry *debugfs; /* 7bit address space */ DECLARE_BITMAP(addrs_in_instantiation, 1 << 7); }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) static inline void *i2c_get_adapdata(const struct i2c_adapter *adap) { return dev_get_drvdata(&adap->dev); } static inline void i2c_set_adapdata(struct i2c_adapter *adap, void *data) { dev_set_drvdata(&adap->dev, data); } static inline struct i2c_adapter * i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter) { #if IS_ENABLED(CONFIG_I2C_MUX) struct device *parent = adapter->dev.parent; if (parent != NULL && parent->type == &i2c_adapter_type) return to_i2c_adapter(parent); else #endif return NULL; } int i2c_for_each_dev(void *data, int (*fn)(struct device *dev, void *data)); /* Adapter locking functions, exported for shared pin cases */ #define I2C_LOCK_ROOT_ADAPTER BIT(0) #define I2C_LOCK_SEGMENT BIT(1) /** * i2c_lock_bus - Get exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER locks the root i2c adapter, I2C_LOCK_SEGMENT * locks only this branch in the adapter tree */ static inline void i2c_lock_bus(struct i2c_adapter *adapter, unsigned int flags) { adapter->lock_ops->lock_bus(adapter, flags); } /** * i2c_trylock_bus - Try to get exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER tries to locks the root i2c adapter, * I2C_LOCK_SEGMENT tries to lock only this branch in the adapter tree * * Return: true if the I2C bus segment is locked, false otherwise */ static inline int i2c_trylock_bus(struct i2c_adapter *adapter, unsigned int flags) { return adapter->lock_ops->trylock_bus(adapter, flags); } /** * i2c_unlock_bus - Release exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER unlocks the root i2c adapter, I2C_LOCK_SEGMENT * unlocks only this branch in the adapter tree */ static inline void i2c_unlock_bus(struct i2c_adapter *adapter, unsigned int flags) { adapter->lock_ops->unlock_bus(adapter, flags); } /** * i2c_mark_adapter_suspended - Report suspended state of the adapter to the core * @adap: Adapter to mark as suspended * * When using this helper to mark an adapter as suspended, the core will reject * further transfers to this adapter. The usage of this helper is optional but * recommended for devices having distinct handlers for system suspend and * runtime suspend. More complex devices are free to implement custom solutions * to reject transfers when suspended. */ static inline void i2c_mark_adapter_suspended(struct i2c_adapter *adap) { i2c_lock_bus(adap, I2C_LOCK_ROOT_ADAPTER); set_bit(I2C_ALF_IS_SUSPENDED, &adap->locked_flags); i2c_unlock_bus(adap, I2C_LOCK_ROOT_ADAPTER); } /** * i2c_mark_adapter_resumed - Report resumed state of the adapter to the core * @adap: Adapter to mark as resumed * * When using this helper to mark an adapter as resumed, the core will allow * further transfers to this adapter. See also further notes to * @i2c_mark_adapter_suspended(). */ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) { i2c_lock_bus(adap, I2C_LOCK_ROOT_ADAPTER); clear_bit(I2C_ALF_IS_SUSPENDED, &adap->locked_flags); i2c_unlock_bus(adap, I2C_LOCK_ROOT_ADAPTER); } /* i2c adapter classes (bitmask) */ #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ /* Warn users that the adapter doesn't support classes anymore */ #define I2C_CLASS_DEPRECATED (1<<8) /* Internal numbers to terminate lists */ #define I2C_CLIENT_END 0xfffeU /* Construct an I2C_CLIENT_END-terminated array of i2c addresses */ #define I2C_ADDRS(addr, addrs...) \ ((const unsigned short []){ addr, ## addrs, I2C_CLIENT_END }) /* ----- functions exported by i2c.o */ /* administration... */ #if IS_ENABLED(CONFIG_I2C) int i2c_add_adapter(struct i2c_adapter *adap); int devm_i2c_add_adapter(struct device *dev, struct i2c_adapter *adapter); void i2c_del_adapter(struct i2c_adapter *adap); int i2c_add_numbered_adapter(struct i2c_adapter *adap); int i2c_register_driver(struct module *owner, struct i2c_driver *driver); void i2c_del_driver(struct i2c_driver *driver); /* use a define to avoid include chaining to get THIS_MODULE */ #define i2c_add_driver(driver) \ i2c_register_driver(THIS_MODULE, driver) static inline bool i2c_client_has_driver(struct i2c_client *client) { return !IS_ERR_OR_NULL(client) && client->dev.driver; } /* call the i2c_client->command() of all attached clients with * the given arguments */ void i2c_clients_command(struct i2c_adapter *adap, unsigned int cmd, void *arg); struct i2c_adapter *i2c_get_adapter(int nr); void i2c_put_adapter(struct i2c_adapter *adap); unsigned int i2c_adapter_depth(struct i2c_adapter *adapter); void i2c_parse_fw_timings(struct device *dev, struct i2c_timings *t, bool use_defaults); /* Return the functionality mask */ static inline u32 i2c_get_functionality(struct i2c_adapter *adap) { return adap->algo->functionality(adap); } /* Return 1 if adapter supports everything we need, 0 if not. */ static inline int i2c_check_functionality(struct i2c_adapter *adap, u32 func) { return (func & i2c_get_functionality(adap)) == func; } /** * i2c_check_quirks() - Function for checking the quirk flags in an i2c adapter * @adap: i2c adapter * @quirks: quirk flags * * Return: true if the adapter has all the specified quirk flags, false if not */ static inline bool i2c_check_quirks(struct i2c_adapter *adap, u64 quirks) { if (!adap->quirks) return false; return (adap->quirks->flags & quirks) == quirks; } /* Return the adapter number for a specific adapter */ static inline int i2c_adapter_id(struct i2c_adapter *adap) { return adap->nr; } static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg) { return (msg->addr << 1) | (msg->flags & I2C_M_RD); } /* * 10-bit address * addr_1: 5'b11110 | addr[9:8] | (R/nW) * addr_2: addr[7:0] */ static inline u8 i2c_10bit_addr_hi_from_msg(const struct i2c_msg *msg) { return 0xf0 | ((msg->addr & GENMASK(9, 8)) >> 7) | (msg->flags & I2C_M_RD); } static inline u8 i2c_10bit_addr_lo_from_msg(const struct i2c_msg *msg) { return msg->addr & GENMASK(7, 0); } u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold); void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred); int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr); /** * module_i2c_driver() - Helper macro for registering a modular I2C driver * @__i2c_driver: i2c_driver struct * * Helper macro for I2C drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_i2c_driver(__i2c_driver) \ module_driver(__i2c_driver, i2c_add_driver, \ i2c_del_driver) /** * builtin_i2c_driver() - Helper macro for registering a builtin I2C driver * @__i2c_driver: i2c_driver struct * * Helper macro for I2C drivers which do not do anything special in their * init. This eliminates a lot of boilerplate. Each driver may only * use this macro once, and calling it replaces device_initcall(). */ #define builtin_i2c_driver(__i2c_driver) \ builtin_driver(__i2c_driver, i2c_add_driver) /* must call put_device() when done with returned i2c_client device */ struct i2c_client *i2c_find_device_by_fwnode(struct fwnode_handle *fwnode); /* must call put_device() when done with returned i2c_adapter device */ struct i2c_adapter *i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode); /* must call i2c_put_adapter() when done with returned i2c_adapter device */ struct i2c_adapter *i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode); #else /* I2C */ static inline struct i2c_client * i2c_find_device_by_fwnode(struct fwnode_handle *fwnode) { return NULL; } static inline struct i2c_adapter * i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode) { return NULL; } static inline struct i2c_adapter * i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode) { return NULL; } #endif /* !I2C */ #if IS_ENABLED(CONFIG_OF) /* must call put_device() when done with returned i2c_client device */ static inline struct i2c_client *of_find_i2c_device_by_node(struct device_node *node) { return i2c_find_device_by_fwnode(of_fwnode_handle(node)); } /* must call put_device() when done with returned i2c_adapter device */ static inline struct i2c_adapter *of_find_i2c_adapter_by_node(struct device_node *node) { return i2c_find_adapter_by_fwnode(of_fwnode_handle(node)); } /* must call i2c_put_adapter() when done with returned i2c_adapter device */ static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node *node) { return i2c_get_adapter_by_fwnode(of_fwnode_handle(node)); } int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info); #else static inline struct i2c_client *of_find_i2c_device_by_node(struct device_node *node) { return NULL; } static inline struct i2c_adapter *of_find_i2c_adapter_by_node(struct device_node *node) { return NULL; } static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node *node) { return NULL; } static inline int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info) { return -ENOTSUPP; } #endif /* CONFIG_OF */ struct acpi_resource; struct acpi_resource_i2c_serialbus; #if IS_REACHABLE(CONFIG_ACPI) && IS_REACHABLE(CONFIG_I2C) bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c); int i2c_acpi_client_count(struct acpi_device *adev); u32 i2c_acpi_find_bus_speed(struct device *dev); struct i2c_client *i2c_acpi_new_device_by_fwnode(struct fwnode_handle *fwnode, int index, struct i2c_board_info *info); struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle); bool i2c_acpi_waive_d0_probe(struct device *dev); #else static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c) { return false; } static inline int i2c_acpi_client_count(struct acpi_device *adev) { return 0; } static inline u32 i2c_acpi_find_bus_speed(struct device *dev) { return 0; } static inline struct i2c_client *i2c_acpi_new_device_by_fwnode( struct fwnode_handle *fwnode, int index, struct i2c_board_info *info) { return ERR_PTR(-ENODEV); } static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) { return NULL; } static inline bool i2c_acpi_waive_d0_probe(struct device *dev) { return false; } #endif /* CONFIG_ACPI */ static inline struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, struct i2c_board_info *info) { return i2c_acpi_new_device_by_fwnode(dev_fwnode(dev), index, info); } #endif /* _LINUX_I2C_H */
30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * AEGIS common definitions * * Copyright (c) 2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (c) 2018 Red Hat, Inc. All rights reserved. */ #ifndef _CRYPTO_AEGIS_H #define _CRYPTO_AEGIS_H #include <crypto/aes.h> #include <linux/bitops.h> #include <linux/types.h> #define AEGIS_BLOCK_SIZE 16 union aegis_block { __le64 words64[AEGIS_BLOCK_SIZE / sizeof(__le64)]; __le32 words32[AEGIS_BLOCK_SIZE / sizeof(__le32)]; u8 bytes[AEGIS_BLOCK_SIZE]; }; struct aegis_state; extern int aegis128_have_aes_insn; #define AEGIS_BLOCK_ALIGN (__alignof__(union aegis_block)) #define AEGIS_ALIGNED(p) IS_ALIGNED((uintptr_t)p, AEGIS_BLOCK_ALIGN) bool crypto_aegis128_have_simd(void); void crypto_aegis128_update_simd(struct aegis_state *state, const void *msg); void crypto_aegis128_init_simd(struct aegis_state *state, const union aegis_block *key, const u8 *iv); void crypto_aegis128_encrypt_chunk_simd(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size); void crypto_aegis128_decrypt_chunk_simd(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size); int crypto_aegis128_final_simd(struct aegis_state *state, union aegis_block *tag_xor, unsigned int assoclen, unsigned int cryptlen, unsigned int authsize); static __always_inline void crypto_aegis_block_xor(union aegis_block *dst, const union aegis_block *src) { dst->words64[0] ^= src->words64[0]; dst->words64[1] ^= src->words64[1]; } static __always_inline void crypto_aegis_block_and(union aegis_block *dst, const union aegis_block *src) { dst->words64[0] &= src->words64[0]; dst->words64[1] &= src->words64[1]; } static __always_inline void crypto_aegis_aesenc(union aegis_block *dst, const union aegis_block *src, const union aegis_block *key) { const u8 *s = src->bytes; const u32 *t = crypto_ft_tab[0]; u32 d0, d1, d2, d3; d0 = t[s[ 0]] ^ rol32(t[s[ 5]], 8) ^ rol32(t[s[10]], 16) ^ rol32(t[s[15]], 24); d1 = t[s[ 4]] ^ rol32(t[s[ 9]], 8) ^ rol32(t[s[14]], 16) ^ rol32(t[s[ 3]], 24); d2 = t[s[ 8]] ^ rol32(t[s[13]], 8) ^ rol32(t[s[ 2]], 16) ^ rol32(t[s[ 7]], 24); d3 = t[s[12]] ^ rol32(t[s[ 1]], 8) ^ rol32(t[s[ 6]], 16) ^ rol32(t[s[11]], 24); dst->words32[0] = cpu_to_le32(d0) ^ key->words32[0]; dst->words32[1] = cpu_to_le32(d1) ^ key->words32[1]; dst->words32[2] = cpu_to_le32(d2) ^ key->words32[2]; dst->words32[3] = cpu_to_le32(d3) ^ key->words32[3]; } #endif /* _CRYPTO_AEGIS_H */
214 214 214 214 214 214 53 53 53 53 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 /* FUSE: Filesystem in Userspace Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. */ #include "fuse_i.h" #include <linux/init.h> #include <linux/module.h> #include <linux/fs_context.h> #define FUSE_CTL_SUPER_MAGIC 0x65735543 /* * This is non-NULL when the single instance of the control filesystem * exists. Protected by fuse_mutex */ static struct super_block *fuse_control_sb; static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) { struct fuse_conn *fc; mutex_lock(&fuse_mutex); fc = file_inode(file)->i_private; if (fc) fc = fuse_conn_get(fc); mutex_unlock(&fuse_mutex); return fc; } static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { if (fc->abort_err) fc->aborted = true; fuse_abort_conn(fc); fuse_conn_put(fc); } return count; } static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { char tmp[32]; size_t size; if (!*ppos) { long value; struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (!fc) return 0; value = atomic_read(&fc->num_waiting); file->private_data = (void *)value; fuse_conn_put(fc); } size = sprintf(tmp, "%ld\n", (long)file->private_data); return simple_read_from_buffer(buf, len, ppos, tmp, size); } static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf, size_t len, loff_t *ppos, unsigned val) { char tmp[32]; size_t size = sprintf(tmp, "%u\n", val); return simple_read_from_buffer(buf, len, ppos, tmp, size); } static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, unsigned *val, unsigned global_limit) { unsigned long t; unsigned limit = (1 << 16) - 1; int err; if (*ppos) return -EINVAL; err = kstrtoul_from_user(buf, count, 0, &t); if (err) return err; if (!capable(CAP_SYS_ADMIN)) limit = min(limit, global_limit); if (t > limit) return -EINVAL; *val = t; return count; } static ssize_t fuse_conn_max_background_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct fuse_conn *fc; unsigned val; fc = fuse_ctl_file_conn_get(file); if (!fc) return 0; val = READ_ONCE(fc->max_background); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); } static ssize_t fuse_conn_max_background_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { unsigned val; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, max_user_bgreq); if (ret > 0) { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { spin_lock(&fc->bg_lock); fc->max_background = val; fc->blocked = fc->num_background >= fc->max_background; if (!fc->blocked) wake_up(&fc->blocked_waitq); spin_unlock(&fc->bg_lock); fuse_conn_put(fc); } } return ret; } static ssize_t fuse_conn_congestion_threshold_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct fuse_conn *fc; unsigned val; fc = fuse_ctl_file_conn_get(file); if (!fc) return 0; val = READ_ONCE(fc->congestion_threshold); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); } static ssize_t fuse_conn_congestion_threshold_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { unsigned val; struct fuse_conn *fc; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, max_user_congthresh); if (ret <= 0) goto out; fc = fuse_ctl_file_conn_get(file); if (!fc) goto out; WRITE_ONCE(fc->congestion_threshold, val); fuse_conn_put(fc); out: return ret; } static const struct file_operations fuse_ctl_abort_ops = { .open = nonseekable_open, .write = fuse_conn_abort_write, }; static const struct file_operations fuse_ctl_waiting_ops = { .open = nonseekable_open, .read = fuse_conn_waiting_read, }; static const struct file_operations fuse_conn_max_background_ops = { .open = nonseekable_open, .read = fuse_conn_max_background_read, .write = fuse_conn_max_background_write, }; static const struct file_operations fuse_conn_congestion_threshold_ops = { .open = nonseekable_open, .read = fuse_conn_congestion_threshold_read, .write = fuse_conn_congestion_threshold_write, }; static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, struct fuse_conn *fc, const char *name, int mode, int nlink, const struct inode_operations *iop, const struct file_operations *fop) { struct dentry *dentry; struct inode *inode; BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES); dentry = d_alloc_name(parent, name); if (!dentry) return NULL; inode = new_inode(fuse_control_sb); if (!inode) { dput(dentry); return NULL; } inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = fc->user_id; inode->i_gid = fc->group_id; simple_inode_init_ts(inode); /* setting ->i_op to NULL is not allowed */ if (iop) inode->i_op = iop; inode->i_fop = fop; set_nlink(inode, nlink); inode->i_private = fc; d_add(dentry, inode); fc->ctl_dentry[fc->ctl_ndents++] = dentry; return dentry; } /* * Add a connection to the control filesystem (if it exists). Caller * must hold fuse_mutex */ int fuse_ctl_add_conn(struct fuse_conn *fc) { struct dentry *parent; char name[32]; if (!fuse_control_sb || fc->no_control) return 0; parent = fuse_control_sb->s_root; inc_nlink(d_inode(parent)); sprintf(name, "%u", fc->dev); parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, &simple_dir_inode_operations, &simple_dir_operations); if (!parent) goto err; if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, NULL, &fuse_ctl_waiting_ops) || !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, NULL, &fuse_ctl_abort_ops) || !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, 1, NULL, &fuse_conn_max_background_ops) || !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", S_IFREG | 0600, 1, NULL, &fuse_conn_congestion_threshold_ops)) goto err; return 0; err: fuse_ctl_remove_conn(fc); return -ENOMEM; } /* * Remove a connection from the control filesystem (if it exists). * Caller must hold fuse_mutex */ void fuse_ctl_remove_conn(struct fuse_conn *fc) { int i; if (!fuse_control_sb || fc->no_control) return; for (i = fc->ctl_ndents - 1; i >= 0; i--) { struct dentry *dentry = fc->ctl_dentry[i]; d_inode(dentry)->i_private = NULL; if (!i) { /* Get rid of submounts: */ d_invalidate(dentry); } dput(dentry); } drop_nlink(d_inode(fuse_control_sb->s_root)); } static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) { static const struct tree_descr empty_descr = {""}; struct fuse_conn *fc; int err; err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr); if (err) return err; mutex_lock(&fuse_mutex); BUG_ON(fuse_control_sb); fuse_control_sb = sb; list_for_each_entry(fc, &fuse_conn_list, entry) { err = fuse_ctl_add_conn(fc); if (err) { fuse_control_sb = NULL; mutex_unlock(&fuse_mutex); return err; } } mutex_unlock(&fuse_mutex); return 0; } static int fuse_ctl_get_tree(struct fs_context *fsc) { return get_tree_single(fsc, fuse_ctl_fill_super); } static const struct fs_context_operations fuse_ctl_context_ops = { .get_tree = fuse_ctl_get_tree, }; static int fuse_ctl_init_fs_context(struct fs_context *fsc) { fsc->ops = &fuse_ctl_context_ops; return 0; } static void fuse_ctl_kill_sb(struct super_block *sb) { struct fuse_conn *fc; mutex_lock(&fuse_mutex); fuse_control_sb = NULL; list_for_each_entry(fc, &fuse_conn_list, entry) fc->ctl_ndents = 0; mutex_unlock(&fuse_mutex); kill_litter_super(sb); } static struct file_system_type fuse_ctl_fs_type = { .owner = THIS_MODULE, .name = "fusectl", .init_fs_context = fuse_ctl_init_fs_context, .kill_sb = fuse_ctl_kill_sb, }; MODULE_ALIAS_FS("fusectl"); int __init fuse_ctl_init(void) { return register_filesystem(&fuse_ctl_fs_type); } void __exit fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); }
3 1 2 528 529 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2023 Isovalent */ #include <linux/bpf.h> #include <linux/bpf_mprog.h> #include <linux/netdevice.h> #include <net/tcx.h> int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { bool created, ingress = attr->attach_type == BPF_TCX_INGRESS; struct net *net = current->nsproxy->net_ns; struct bpf_mprog_entry *entry, *entry_new; struct bpf_prog *replace_prog = NULL; struct net_device *dev; int ret; rtnl_lock(); dev = __dev_get_by_index(net, attr->target_ifindex); if (!dev) { ret = -ENODEV; goto out; } if (attr->attach_flags & BPF_F_REPLACE) { replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, prog->type); if (IS_ERR(replace_prog)) { ret = PTR_ERR(replace_prog); replace_prog = NULL; goto out; } } entry = tcx_entry_fetch_or_create(dev, ingress, &created); if (!entry) { ret = -ENOMEM; goto out; } ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog, attr->attach_flags, attr->relative_fd, attr->expected_revision); if (!ret) { if (entry != entry_new) { tcx_entry_update(dev, entry_new, ingress); tcx_entry_sync(); tcx_skeys_inc(ingress); } bpf_mprog_commit(entry); } else if (created) { tcx_entry_free(entry); } out: if (replace_prog) bpf_prog_put(replace_prog); rtnl_unlock(); return ret; } int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog) { bool ingress = attr->attach_type == BPF_TCX_INGRESS; struct net *net = current->nsproxy->net_ns; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret; rtnl_lock(); dev = __dev_get_by_index(net, attr->target_ifindex); if (!dev) { ret = -ENODEV; goto out; } entry = tcx_entry_fetch(dev, ingress); if (!entry) { ret = -ENOENT; goto out; } ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags, attr->relative_fd, attr->expected_revision); if (!ret) { if (!tcx_entry_is_active(entry_new)) entry_new = NULL; tcx_entry_update(dev, entry_new, ingress); tcx_entry_sync(); tcx_skeys_dec(ingress); bpf_mprog_commit(entry); if (!entry_new) tcx_entry_free(entry); } out: rtnl_unlock(); return ret; } void tcx_uninstall(struct net_device *dev, bool ingress) { struct bpf_mprog_entry *entry, *entry_new = NULL; struct bpf_tuple tuple = {}; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; bool active; entry = tcx_entry_fetch(dev, ingress); if (!entry) return; active = tcx_entry(entry)->miniq_active; if (active) bpf_mprog_clear_all(entry, &entry_new); tcx_entry_update(dev, entry_new, ingress); tcx_entry_sync(); bpf_mprog_foreach_tuple(entry, fp, cp, tuple) { if (tuple.link) tcx_link(tuple.link)->dev = NULL; else bpf_prog_put(tuple.prog); tcx_skeys_dec(ingress); } if (!active) tcx_entry_free(entry); } int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { bool ingress = attr->query.attach_type == BPF_TCX_INGRESS; struct net *net = current->nsproxy->net_ns; struct net_device *dev; int ret; rtnl_lock(); dev = __dev_get_by_index(net, attr->query.target_ifindex); if (!dev) { ret = -ENODEV; goto out; } ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress)); out: rtnl_unlock(); return ret; } static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct tcx_link *tcx = tcx_link(link); bool created, ingress = tcx->location == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev = tcx->dev; int ret; ASSERT_RTNL(); entry = tcx_entry_fetch_or_create(dev, ingress, &created); if (!entry) return -ENOMEM; ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags, id_or_fd, revision); if (!ret) { if (entry != entry_new) { tcx_entry_update(dev, entry_new, ingress); tcx_entry_sync(); tcx_skeys_inc(ingress); } bpf_mprog_commit(entry); } else if (created) { tcx_entry_free(entry); } return ret; } static void tcx_link_release(struct bpf_link *link) { struct tcx_link *tcx = tcx_link(link); bool ingress = tcx->location == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; rtnl_lock(); dev = tcx->dev; if (!dev) goto out; entry = tcx_entry_fetch(dev, ingress); if (!entry) { ret = -ENOENT; goto out; } ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0); if (!ret) { if (!tcx_entry_is_active(entry_new)) entry_new = NULL; tcx_entry_update(dev, entry_new, ingress); tcx_entry_sync(); tcx_skeys_dec(ingress); bpf_mprog_commit(entry); if (!entry_new) tcx_entry_free(entry); tcx->dev = NULL; } out: WARN_ON_ONCE(ret); rtnl_unlock(); } static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog, struct bpf_prog *oprog) { struct tcx_link *tcx = tcx_link(link); bool ingress = tcx->location == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; rtnl_lock(); dev = tcx->dev; if (!dev) { ret = -ENOLINK; goto out; } if (oprog && link->prog != oprog) { ret = -EPERM; goto out; } oprog = link->prog; if (oprog == nprog) { bpf_prog_put(nprog); goto out; } entry = tcx_entry_fetch(dev, ingress); if (!entry) { ret = -ENOENT; goto out; } ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog, BPF_F_REPLACE | BPF_F_ID, link->prog->aux->id, 0); if (!ret) { WARN_ON_ONCE(entry != entry_new); oprog = xchg(&link->prog, nprog); bpf_prog_put(oprog); bpf_mprog_commit(entry); } out: rtnl_unlock(); return ret; } static void tcx_link_dealloc(struct bpf_link *link) { kfree(tcx_link(link)); } static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) { const struct tcx_link *tcx = tcx_link(link); u32 ifindex = 0; rtnl_lock(); if (tcx->dev) ifindex = tcx->dev->ifindex; rtnl_unlock(); seq_printf(seq, "ifindex:\t%u\n", ifindex); seq_printf(seq, "attach_type:\t%u (%s)\n", tcx->location, tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress"); } static int tcx_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { const struct tcx_link *tcx = tcx_link(link); u32 ifindex = 0; rtnl_lock(); if (tcx->dev) ifindex = tcx->dev->ifindex; rtnl_unlock(); info->tcx.ifindex = ifindex; info->tcx.attach_type = tcx->location; return 0; } static int tcx_link_detach(struct bpf_link *link) { tcx_link_release(link); return 0; } static const struct bpf_link_ops tcx_link_lops = { .release = tcx_link_release, .detach = tcx_link_detach, .dealloc = tcx_link_dealloc, .update_prog = tcx_link_update, .show_fdinfo = tcx_link_fdinfo, .fill_link_info = tcx_link_fill_info, }; static int tcx_link_init(struct tcx_link *tcx, struct bpf_link_primer *link_primer, const union bpf_attr *attr, struct net_device *dev, struct bpf_prog *prog) { bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog); tcx->location = attr->link_create.attach_type; tcx->dev = dev; return bpf_link_prime(&tcx->link, link_primer); } int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; struct net_device *dev; struct tcx_link *tcx; int ret; rtnl_lock(); dev = __dev_get_by_index(net, attr->link_create.target_ifindex); if (!dev) { ret = -ENODEV; goto out; } tcx = kzalloc(sizeof(*tcx), GFP_USER); if (!tcx) { ret = -ENOMEM; goto out; } ret = tcx_link_init(tcx, &link_primer, attr, dev, prog); if (ret) { kfree(tcx); goto out; } ret = tcx_link_prog_attach(&tcx->link, attr->link_create.flags, attr->link_create.tcx.relative_fd, attr->link_create.tcx.expected_revision); if (ret) { tcx->dev = NULL; bpf_link_cleanup(&link_primer); goto out; } ret = bpf_link_settle(&link_primer); out: rtnl_unlock(); return ret; }
17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 /* SPDX-License-Identifier: GPL-2.0 */ /* include/net/dsfield.h - Manipulation of the Differentiated Services field */ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ #ifndef __NET_DSFIELD_H #define __NET_DSFIELD_H #include <linux/types.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <asm/byteorder.h> static inline __u8 ipv4_get_dsfield(const struct iphdr *iph) { return iph->tos; } static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h) { return ntohs(*(__force const __be16 *)ipv6h) >> 4; } static inline void ipv4_change_dsfield(struct iphdr *iph,__u8 mask, __u8 value) { __u32 check = ntohs((__force __be16)iph->check); __u8 dsfield; dsfield = (iph->tos & mask) | value; check += iph->tos; if ((check+1) >> 16) check = (check+1) & 0xffff; check -= dsfield; check += check >> 16; /* adjust carry */ iph->check = (__force __sum16)htons(check); iph->tos = dsfield; } static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h,__u8 mask, __u8 value) { __be16 *p = (__force __be16 *)ipv6h; *p = (*p & htons((((u16)mask << 4) | 0xf00f))) | htons((u16)value << 4); } #endif
8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 6 6 2 1 2 2 2 2 8 8 4 6 6 2 2 2 2 1 7 7 7 8 8 7 2 2 2 7 11 1 2 8 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 // SPDX-License-Identifier: GPL-2.0-only /* * Driver for the VoIP USB phones with CM109 chipsets. * * Copyright (C) 2007 - 2008 Alfred E. Heggestad <aeh@db.org> */ /* * Tested devices: * - Komunikate KIP1000 * - Genius G-talk * - Allied-Telesis Corega USBPH01 * - ... * * This driver is based on the yealink.c driver * * Thanks to: * - Authors of yealink.c * - Thomas Reitmayr * - Oliver Neukum for good review comments and code * - Shaun Jackman <sjackman@gmail.com> for Genius G-talk keymap * - Dmitry Torokhov for valuable input and review * * Todo: * - Read/write EEPROM */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/rwsem.h> #include <linux/usb/input.h> #define DRIVER_VERSION "20080805" #define DRIVER_AUTHOR "Alfred E. Heggestad" #define DRIVER_DESC "CM109 phone driver" static char *phone = "kip1000"; module_param(phone, charp, S_IRUSR); MODULE_PARM_DESC(phone, "Phone name {kip1000, gtalk, usbph01, atcom}"); enum { /* HID Registers */ HID_IR0 = 0x00, /* Record/Playback-mute button, Volume up/down */ HID_IR1 = 0x01, /* GPI, generic registers or EEPROM_DATA0 */ HID_IR2 = 0x02, /* Generic registers or EEPROM_DATA1 */ HID_IR3 = 0x03, /* Generic registers or EEPROM_CTRL */ HID_OR0 = 0x00, /* Mapping control, buzzer, SPDIF (offset 0x04) */ HID_OR1 = 0x01, /* GPO - General Purpose Output */ HID_OR2 = 0x02, /* Set GPIO to input/output mode */ HID_OR3 = 0x03, /* SPDIF status channel or EEPROM_CTRL */ /* HID_IR0 */ RECORD_MUTE = 1 << 3, PLAYBACK_MUTE = 1 << 2, VOLUME_DOWN = 1 << 1, VOLUME_UP = 1 << 0, /* HID_OR0 */ /* bits 7-6 0: HID_OR1-2 are used for GPO; HID_OR0, 3 are used for buzzer and SPDIF 1: HID_OR0-3 are used as generic HID registers 2: Values written to HID_OR0-3 are also mapped to MCU_CTRL, EEPROM_DATA0-1, EEPROM_CTRL (see Note) 3: Reserved */ HID_OR_GPO_BUZ_SPDIF = 0 << 6, HID_OR_GENERIC_HID_REG = 1 << 6, HID_OR_MAP_MCU_EEPROM = 2 << 6, BUZZER_ON = 1 << 5, /* up to 256 normal keys, up to 15 special key combinations */ KEYMAP_SIZE = 256 + 15, }; /* CM109 protocol packet */ struct cm109_ctl_packet { u8 byte[4]; } __attribute__ ((packed)); enum { USB_PKT_LEN = sizeof(struct cm109_ctl_packet) }; /* CM109 device structure */ struct cm109_dev { struct input_dev *idev; /* input device */ struct usb_device *udev; /* usb device */ struct usb_interface *intf; /* irq input channel */ struct cm109_ctl_packet *irq_data; dma_addr_t irq_dma; struct urb *urb_irq; /* control output channel */ struct cm109_ctl_packet *ctl_data; dma_addr_t ctl_dma; struct usb_ctrlrequest *ctl_req; struct urb *urb_ctl; /* * The 3 bitfields below are protected by ctl_submit_lock. * They have to be separate since they are accessed from IRQ * context. */ unsigned irq_urb_pending:1; /* irq_urb is in flight */ unsigned ctl_urb_pending:1; /* ctl_urb is in flight */ unsigned buzzer_pending:1; /* need to issue buzz command */ spinlock_t ctl_submit_lock; unsigned char buzzer_state; /* on/off */ /* flags */ unsigned open:1; unsigned resetting:1; unsigned shutdown:1; /* This mutex protects writes to the above flags */ struct mutex pm_mutex; unsigned short keymap[KEYMAP_SIZE]; char phys[64]; /* physical device path */ int key_code; /* last reported key */ int keybit; /* 0=new scan 1,2,4,8=scan columns */ u8 gpi; /* Cached value of GPI (high nibble) */ }; /****************************************************************************** * CM109 key interface *****************************************************************************/ static unsigned short special_keymap(int code) { if (code > 0xff) { switch (code - 0xff) { case RECORD_MUTE: return KEY_MICMUTE; case PLAYBACK_MUTE: return KEY_MUTE; case VOLUME_DOWN: return KEY_VOLUMEDOWN; case VOLUME_UP: return KEY_VOLUMEUP; } } return KEY_RESERVED; } /* Map device buttons to internal key events. * * The "up" and "down" keys, are symbolised by arrows on the button. * The "pickup" and "hangup" keys are symbolised by a green and red phone * on the button. Komunikate KIP1000 Keyboard Matrix -> -- 1 -- 2 -- 3 --> GPI pin 4 (0x10) | | | | <- -- 4 -- 5 -- 6 --> GPI pin 5 (0x20) | | | | END - 7 -- 8 -- 9 --> GPI pin 6 (0x40) | | | | OK -- * -- 0 -- # --> GPI pin 7 (0x80) | | | | /|\ /|\ /|\ /|\ | | | | GPO pin: 3 2 1 0 0x8 0x4 0x2 0x1 */ static unsigned short keymap_kip1000(int scancode) { switch (scancode) { /* phone key: */ case 0x82: return KEY_NUMERIC_0; /* 0 */ case 0x14: return KEY_NUMERIC_1; /* 1 */ case 0x12: return KEY_NUMERIC_2; /* 2 */ case 0x11: return KEY_NUMERIC_3; /* 3 */ case 0x24: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x21: return KEY_NUMERIC_6; /* 6 */ case 0x44: return KEY_NUMERIC_7; /* 7 */ case 0x42: return KEY_NUMERIC_8; /* 8 */ case 0x41: return KEY_NUMERIC_9; /* 9 */ case 0x81: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x88: return KEY_ENTER; /* pickup */ case 0x48: return KEY_ESC; /* hangup */ case 0x28: return KEY_LEFT; /* IN */ case 0x18: return KEY_RIGHT; /* OUT */ default: return special_keymap(scancode); } } /* Contributed by Shaun Jackman <sjackman@gmail.com> Genius G-Talk keyboard matrix 0 1 2 3 4: 0 4 8 Talk 5: 1 5 9 End 6: 2 6 # Up 7: 3 7 * Down */ static unsigned short keymap_gtalk(int scancode) { switch (scancode) { case 0x11: return KEY_NUMERIC_0; case 0x21: return KEY_NUMERIC_1; case 0x41: return KEY_NUMERIC_2; case 0x81: return KEY_NUMERIC_3; case 0x12: return KEY_NUMERIC_4; case 0x22: return KEY_NUMERIC_5; case 0x42: return KEY_NUMERIC_6; case 0x82: return KEY_NUMERIC_7; case 0x14: return KEY_NUMERIC_8; case 0x24: return KEY_NUMERIC_9; case 0x44: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* Talk (green handset) */ case 0x28: return KEY_ESC; /* End (red handset) */ case 0x48: return KEY_UP; /* Menu up (rocker switch) */ case 0x88: return KEY_DOWN; /* Menu down (rocker switch) */ default: return special_keymap(scancode); } } /* * Keymap for Allied-Telesis Corega USBPH01 * http://www.alliedtelesis-corega.com/2/1344/1437/1360/chprd.html * * Contributed by july@nat.bg */ static unsigned short keymap_usbph01(int scancode) { switch (scancode) { case 0x11: return KEY_NUMERIC_0; /* 0 */ case 0x21: return KEY_NUMERIC_1; /* 1 */ case 0x41: return KEY_NUMERIC_2; /* 2 */ case 0x81: return KEY_NUMERIC_3; /* 3 */ case 0x12: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x42: return KEY_NUMERIC_6; /* 6 */ case 0x82: return KEY_NUMERIC_7; /* 7 */ case 0x14: return KEY_NUMERIC_8; /* 8 */ case 0x24: return KEY_NUMERIC_9; /* 9 */ case 0x44: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* pickup */ case 0x28: return KEY_ESC; /* hangup */ case 0x48: return KEY_LEFT; /* IN */ case 0x88: return KEY_RIGHT; /* OUT */ default: return special_keymap(scancode); } } /* * Keymap for ATCom AU-100 * http://www.atcom.cn/products.html * http://www.packetizer.com/products/au100/ * http://www.voip-info.org/wiki/view/AU-100 * * Contributed by daniel@gimpelevich.san-francisco.ca.us */ static unsigned short keymap_atcom(int scancode) { switch (scancode) { /* phone key: */ case 0x82: return KEY_NUMERIC_0; /* 0 */ case 0x11: return KEY_NUMERIC_1; /* 1 */ case 0x12: return KEY_NUMERIC_2; /* 2 */ case 0x14: return KEY_NUMERIC_3; /* 3 */ case 0x21: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x24: return KEY_NUMERIC_6; /* 6 */ case 0x41: return KEY_NUMERIC_7; /* 7 */ case 0x42: return KEY_NUMERIC_8; /* 8 */ case 0x44: return KEY_NUMERIC_9; /* 9 */ case 0x84: return KEY_NUMERIC_POUND; /* # */ case 0x81: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* pickup */ case 0x28: return KEY_ESC; /* hangup */ case 0x48: return KEY_LEFT; /* left arrow */ case 0x88: return KEY_RIGHT; /* right arrow */ default: return special_keymap(scancode); } } static unsigned short (*keymap)(int) = keymap_kip1000; /* * Completes a request by converting the data into events for the * input subsystem. */ static void report_key(struct cm109_dev *dev, int key) { struct input_dev *idev = dev->idev; if (dev->key_code >= 0) { /* old key up */ input_report_key(idev, dev->key_code, 0); } dev->key_code = key; if (key >= 0) { /* new valid key */ input_report_key(idev, key, 1); } input_sync(idev); } /* * Converts data of special key presses (volume, mute) into events * for the input subsystem, sends press-n-release for mute keys. */ static void cm109_report_special(struct cm109_dev *dev) { static const u8 autorelease = RECORD_MUTE | PLAYBACK_MUTE; struct input_dev *idev = dev->idev; u8 data = dev->irq_data->byte[HID_IR0]; unsigned short keycode; int i; for (i = 0; i < 4; i++) { keycode = dev->keymap[0xff + BIT(i)]; if (keycode == KEY_RESERVED) continue; input_report_key(idev, keycode, data & BIT(i)); if (data & autorelease & BIT(i)) { input_sync(idev); input_report_key(idev, keycode, 0); } } input_sync(idev); } /****************************************************************************** * CM109 usb communication interface *****************************************************************************/ static void cm109_submit_buzz_toggle(struct cm109_dev *dev) { int error; if (dev->buzzer_state) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; error = usb_submit_urb(dev->urb_ctl, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); } static void cm109_submit_ctl(struct cm109_dev *dev) { int error; guard(spinlock_irqsave)(&dev->ctl_submit_lock); dev->irq_urb_pending = 0; if (unlikely(dev->shutdown)) return; if (dev->buzzer_state) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; dev->ctl_data->byte[HID_OR1] = dev->keybit; dev->ctl_data->byte[HID_OR2] = dev->keybit; dev->buzzer_pending = 0; dev->ctl_urb_pending = 1; error = usb_submit_urb(dev->urb_ctl, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); } /* * IRQ handler */ static void cm109_urb_irq_callback(struct urb *urb) { struct cm109_dev *dev = urb->context; const int status = urb->status; dev_dbg(&dev->intf->dev, "### URB IRQ: [0x%02x 0x%02x 0x%02x 0x%02x] keybit=0x%02x\n", dev->irq_data->byte[0], dev->irq_data->byte[1], dev->irq_data->byte[2], dev->irq_data->byte[3], dev->keybit); if (status) { if (status == -ESHUTDOWN) return; dev_err_ratelimited(&dev->intf->dev, "%s: urb status %d\n", __func__, status); goto out; } /* Special keys */ cm109_report_special(dev); /* Scan key column */ if (dev->keybit == 0xf) { /* Any changes ? */ if ((dev->gpi & 0xf0) == (dev->irq_data->byte[HID_IR1] & 0xf0)) goto out; dev->gpi = dev->irq_data->byte[HID_IR1] & 0xf0; dev->keybit = 0x1; } else { report_key(dev, dev->keymap[dev->irq_data->byte[HID_IR1]]); dev->keybit <<= 1; if (dev->keybit > 0x8) dev->keybit = 0xf; } out: cm109_submit_ctl(dev); } static void cm109_urb_ctl_callback(struct urb *urb) { struct cm109_dev *dev = urb->context; const int status = urb->status; int error; dev_dbg(&dev->intf->dev, "### URB CTL: [0x%02x 0x%02x 0x%02x 0x%02x]\n", dev->ctl_data->byte[0], dev->ctl_data->byte[1], dev->ctl_data->byte[2], dev->ctl_data->byte[3]); if (status) { if (status == -ESHUTDOWN) return; dev_err_ratelimited(&dev->intf->dev, "%s: urb status %d\n", __func__, status); } guard(spinlock_irqsave)(&dev->ctl_submit_lock); dev->ctl_urb_pending = 0; if (unlikely(dev->shutdown)) return; if (dev->buzzer_pending || status) { dev->buzzer_pending = 0; dev->ctl_urb_pending = 1; cm109_submit_buzz_toggle(dev); } else if (likely(!dev->irq_urb_pending)) { /* ask for key data */ dev->irq_urb_pending = 1; error = usb_submit_urb(dev->urb_irq, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_irq) failed %d\n", __func__, error); } } static void cm109_toggle_buzzer_async(struct cm109_dev *dev) { guard(spinlock_irqsave)(&dev->ctl_submit_lock); if (dev->ctl_urb_pending) { /* URB completion will resubmit */ dev->buzzer_pending = 1; } else { dev->ctl_urb_pending = 1; cm109_submit_buzz_toggle(dev); } } static void cm109_toggle_buzzer_sync(struct cm109_dev *dev, int on) { int error; if (on) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; error = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), dev->ctl_req->bRequest, dev->ctl_req->bRequestType, le16_to_cpu(dev->ctl_req->wValue), le16_to_cpu(dev->ctl_req->wIndex), dev->ctl_data, USB_PKT_LEN, USB_CTRL_SET_TIMEOUT); if (error < 0 && error != -EINTR) dev_err(&dev->intf->dev, "%s: usb_control_msg() failed %d\n", __func__, error); } static void cm109_stop_traffic(struct cm109_dev *dev) { dev->shutdown = 1; /* * Make sure other CPUs see this */ smp_wmb(); usb_kill_urb(dev->urb_ctl); usb_kill_urb(dev->urb_irq); cm109_toggle_buzzer_sync(dev, 0); dev->shutdown = 0; smp_wmb(); } static void cm109_restore_state(struct cm109_dev *dev) { if (dev->open) { /* * Restore buzzer state. * This will also kick regular URB submission */ cm109_toggle_buzzer_async(dev); } } /****************************************************************************** * input event interface *****************************************************************************/ static int cm109_input_open(struct input_dev *idev) { struct cm109_dev *dev = input_get_drvdata(idev); int error; error = usb_autopm_get_interface(dev->intf); if (error < 0) { dev_err(&idev->dev, "%s - cannot autoresume, result %d\n", __func__, error); return error; } scoped_guard(mutex, &dev->pm_mutex) { dev->buzzer_state = 0; dev->key_code = -1; /* no keys pressed */ dev->keybit = 0xf; /* issue INIT */ dev->ctl_data->byte[HID_OR0] = HID_OR_GPO_BUZ_SPDIF; dev->ctl_data->byte[HID_OR1] = dev->keybit; dev->ctl_data->byte[HID_OR2] = dev->keybit; dev->ctl_data->byte[HID_OR3] = 0x00; dev->ctl_urb_pending = 1; error = usb_submit_urb(dev->urb_ctl, GFP_KERNEL); if (!error) { dev->open = 1; return 0; } } dev->ctl_urb_pending = 0; usb_autopm_put_interface(dev->intf); dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); return error; } static void cm109_input_close(struct input_dev *idev) { struct cm109_dev *dev = input_get_drvdata(idev); scoped_guard(mutex, &dev->pm_mutex) { /* * Once we are here event delivery is stopped so we * don't need to worry about someone starting buzzer * again */ cm109_stop_traffic(dev); dev->open = 0; } usb_autopm_put_interface(dev->intf); } static int cm109_input_ev(struct input_dev *idev, unsigned int type, unsigned int code, int value) { struct cm109_dev *dev = input_get_drvdata(idev); dev_dbg(&dev->intf->dev, "input_ev: type=%u code=%u value=%d\n", type, code, value); if (type != EV_SND) return -EINVAL; switch (code) { case SND_TONE: case SND_BELL: dev->buzzer_state = !!value; if (!dev->resetting) cm109_toggle_buzzer_async(dev); return 0; default: return -EINVAL; } } /****************************************************************************** * Linux interface and usb initialisation *****************************************************************************/ struct driver_info { char *name; }; static const struct driver_info info_cm109 = { .name = "CM109 USB driver", }; enum { VENDOR_ID = 0x0d8c, /* C-Media Electronics */ PRODUCT_ID_CM109 = 0x000e, /* CM109 defines range 0x0008 - 0x000f */ }; /* table of devices that work with this driver */ static const struct usb_device_id cm109_usb_table[] = { { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = VENDOR_ID, .idProduct = PRODUCT_ID_CM109, .bInterfaceClass = USB_CLASS_HID, .bInterfaceSubClass = 0, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t) &info_cm109 }, /* you can add more devices here with product ID 0x0008 - 0x000f */ { } }; static void cm109_usb_cleanup(struct cm109_dev *dev) { kfree(dev->ctl_req); usb_free_coherent(dev->udev, USB_PKT_LEN, dev->ctl_data, dev->ctl_dma); usb_free_coherent(dev->udev, USB_PKT_LEN, dev->irq_data, dev->irq_dma); usb_free_urb(dev->urb_irq); /* parameter validation in core/urb */ usb_free_urb(dev->urb_ctl); /* parameter validation in core/urb */ kfree(dev); } static void cm109_usb_disconnect(struct usb_interface *interface) { struct cm109_dev *dev = usb_get_intfdata(interface); usb_set_intfdata(interface, NULL); input_unregister_device(dev->idev); cm109_usb_cleanup(dev); } static int cm109_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct driver_info *nfo = (struct driver_info *)id->driver_info; struct usb_host_interface *interface; struct usb_endpoint_descriptor *endpoint; struct cm109_dev *dev; struct input_dev *input_dev = NULL; int ret, pipe, i; int error = -ENOMEM; interface = intf->cur_altsetting; if (interface->desc.bNumEndpoints < 1) return -ENODEV; endpoint = &interface->endpoint[0].desc; if (!usb_endpoint_is_int_in(endpoint)) return -ENODEV; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; spin_lock_init(&dev->ctl_submit_lock); mutex_init(&dev->pm_mutex); dev->udev = udev; dev->intf = intf; dev->idev = input_dev = input_allocate_device(); if (!input_dev) goto err_out; /* allocate usb buffers */ dev->irq_data = usb_alloc_coherent(udev, USB_PKT_LEN, GFP_KERNEL, &dev->irq_dma); if (!dev->irq_data) goto err_out; dev->ctl_data = usb_alloc_coherent(udev, USB_PKT_LEN, GFP_KERNEL, &dev->ctl_dma); if (!dev->ctl_data) goto err_out; dev->ctl_req = kmalloc(sizeof(*(dev->ctl_req)), GFP_KERNEL); if (!dev->ctl_req) goto err_out; /* allocate urb structures */ dev->urb_irq = usb_alloc_urb(0, GFP_KERNEL); if (!dev->urb_irq) goto err_out; dev->urb_ctl = usb_alloc_urb(0, GFP_KERNEL); if (!dev->urb_ctl) goto err_out; /* get a handle to the interrupt data pipe */ pipe = usb_rcvintpipe(udev, endpoint->bEndpointAddress); ret = usb_maxpacket(udev, pipe); if (ret != USB_PKT_LEN) dev_err(&intf->dev, "invalid payload size %d, expected %d\n", ret, USB_PKT_LEN); /* initialise irq urb */ usb_fill_int_urb(dev->urb_irq, udev, pipe, dev->irq_data, USB_PKT_LEN, cm109_urb_irq_callback, dev, endpoint->bInterval); dev->urb_irq->transfer_dma = dev->irq_dma; dev->urb_irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; dev->urb_irq->dev = udev; /* initialise ctl urb */ dev->ctl_req->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT; dev->ctl_req->bRequest = USB_REQ_SET_CONFIGURATION; dev->ctl_req->wValue = cpu_to_le16(0x200); dev->ctl_req->wIndex = cpu_to_le16(interface->desc.bInterfaceNumber); dev->ctl_req->wLength = cpu_to_le16(USB_PKT_LEN); usb_fill_control_urb(dev->urb_ctl, udev, usb_sndctrlpipe(udev, 0), (void *)dev->ctl_req, dev->ctl_data, USB_PKT_LEN, cm109_urb_ctl_callback, dev); dev->urb_ctl->transfer_dma = dev->ctl_dma; dev->urb_ctl->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; dev->urb_ctl->dev = udev; /* find out the physical bus location */ usb_make_path(udev, dev->phys, sizeof(dev->phys)); strlcat(dev->phys, "/input0", sizeof(dev->phys)); /* register settings for the input device */ input_dev->name = nfo->name; input_dev->phys = dev->phys; usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &intf->dev; input_set_drvdata(input_dev, dev); input_dev->open = cm109_input_open; input_dev->close = cm109_input_close; input_dev->event = cm109_input_ev; input_dev->keycode = dev->keymap; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(dev->keymap); input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_SND); input_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE); /* register available key events */ for (i = 0; i < KEYMAP_SIZE; i++) { unsigned short k = keymap(i); dev->keymap[i] = k; __set_bit(k, input_dev->keybit); } __clear_bit(KEY_RESERVED, input_dev->keybit); error = input_register_device(dev->idev); if (error) goto err_out; usb_set_intfdata(intf, dev); return 0; err_out: input_free_device(input_dev); cm109_usb_cleanup(dev); return error; } static int cm109_usb_suspend(struct usb_interface *intf, pm_message_t message) { struct cm109_dev *dev = usb_get_intfdata(intf); dev_info(&intf->dev, "cm109: usb_suspend (event=%d)\n", message.event); guard(mutex)(&dev->pm_mutex); cm109_stop_traffic(dev); return 0; } static int cm109_usb_resume(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); dev_info(&intf->dev, "cm109: usb_resume\n"); guard(mutex)(&dev->pm_mutex); cm109_restore_state(dev); return 0; } static int cm109_usb_pre_reset(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); mutex_lock(&dev->pm_mutex); /* * Make sure input events don't try to toggle buzzer * while we are resetting */ dev->resetting = 1; smp_wmb(); cm109_stop_traffic(dev); return 0; } static int cm109_usb_post_reset(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); dev->resetting = 0; smp_wmb(); cm109_restore_state(dev); mutex_unlock(&dev->pm_mutex); return 0; } static struct usb_driver cm109_driver = { .name = "cm109", .probe = cm109_usb_probe, .disconnect = cm109_usb_disconnect, .suspend = cm109_usb_suspend, .resume = cm109_usb_resume, .reset_resume = cm109_usb_resume, .pre_reset = cm109_usb_pre_reset, .post_reset = cm109_usb_post_reset, .id_table = cm109_usb_table, .supports_autosuspend = 1, }; static int __init cm109_select_keymap(void) { /* Load the phone keymap */ if (!strcasecmp(phone, "kip1000")) { keymap = keymap_kip1000; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Komunikate KIP1000 phone loaded\n"); } else if (!strcasecmp(phone, "gtalk")) { keymap = keymap_gtalk; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Genius G-talk phone loaded\n"); } else if (!strcasecmp(phone, "usbph01")) { keymap = keymap_usbph01; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Allied-Telesis Corega USBPH01 phone loaded\n"); } else if (!strcasecmp(phone, "atcom")) { keymap = keymap_atcom; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for ATCom AU-100 phone loaded\n"); } else { printk(KERN_ERR KBUILD_MODNAME ": " "Unsupported phone: %s\n", phone); return -EINVAL; } return 0; } static int __init cm109_init(void) { int err; err = cm109_select_keymap(); if (err) return err; err = usb_register(&cm109_driver); if (err) return err; printk(KERN_INFO KBUILD_MODNAME ": " DRIVER_DESC ": " DRIVER_VERSION " (C) " DRIVER_AUTHOR "\n"); return 0; } static void __exit cm109_exit(void) { usb_deregister(&cm109_driver); } module_init(cm109_init); module_exit(cm109_exit); MODULE_DEVICE_TABLE(usb, cm109_usb_table); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
170 170 170 170 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_ftp.c: IPVS ftp application module * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: * * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. * * IP_MASQ_FTP ftp masquerading module * * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 * * Author: Wouter Gadeyne */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <linux/gfp.h> #include <net/protocol.h> #include <net/tcp.h> #include <linux/unaligned.h> #include <net/ip_vs.h> #define SERVER_STRING_PASV "227 " #define CLIENT_STRING_PORT "PORT" #define SERVER_STRING_EPSV "229 " #define CLIENT_STRING_EPRT "EPRT" enum { IP_VS_FTP_ACTIVE = 0, IP_VS_FTP_PORT = 0, IP_VS_FTP_PASV, IP_VS_FTP_EPRT, IP_VS_FTP_EPSV, }; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. */ static unsigned int ports_count = 1; static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; module_param_array(ports, ushort, &ports_count, 0444); MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh) { struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len); if ((th->doff << 2) < sizeof(struct tcphdr)) return NULL; return (char *)th + (th->doff << 2); } static int ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) { /* We use connection tracking for the command connection */ cp->flags |= IP_VS_CONN_F_NFCT; return 0; } static int ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) { return 0; } /* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started * with the "pattern". <addr,port> is in network order. * Parse extended format depending on ext. In this case addr can be pre-set. */ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, const char *pattern, size_t plen, char skip, bool ext, int mode, union nf_inet_addr *addr, __be16 *port, __u16 af, char **start, char **end) { char *s, c; unsigned char p[6]; char edelim; __u16 hport; int i = 0; if (data_limit - data < plen) { /* check if there is partial match */ if (strncasecmp(data, pattern, data_limit - data) == 0) return -1; else return 0; } if (strncasecmp(data, pattern, plen) != 0) { return 0; } s = data + plen; if (skip) { bool found = false; for (;; s++) { if (s == data_limit) return -1; if (!found) { /* "(" is optional for non-extended format, * so catch the start of IPv4 address */ if (!ext && isdigit(*s)) break; if (*s == skip) found = true; } else if (*s != skip) { break; } } } /* Old IPv4-only format? */ if (!ext) { p[0] = 0; for (data = s; ; data++) { if (data == data_limit) return -1; c = *data; if (isdigit(c)) { p[i] = p[i]*10 + c - '0'; } else if (c == ',' && i < 5) { i++; p[i] = 0; } else { /* unexpected character or terminator */ break; } } if (i != 5) return -1; *start = s; *end = data; addr->ip = get_unaligned((__be32 *) p); *port = get_unaligned((__be16 *) (p + 4)); return 1; } if (s == data_limit) return -1; *start = s; edelim = *s++; if (edelim < 33 || edelim > 126) return -1; if (s == data_limit) return -1; if (*s == edelim) { /* Address family is usually missing for EPSV response */ if (mode != IP_VS_FTP_EPSV) return -1; s++; if (s == data_limit) return -1; /* Then address should be missing too */ if (*s != edelim) return -1; /* Caller can pre-set addr, if needed */ s++; } else { const char *ep; /* We allow address only from same family */ if (af == AF_INET6 && *s != '2') return -1; if (af == AF_INET && *s != '1') return -1; s++; if (s == data_limit) return -1; if (*s != edelim) return -1; s++; if (s == data_limit) return -1; if (af == AF_INET6) { if (in6_pton(s, data_limit - s, (u8 *)addr, edelim, &ep) <= 0) return -1; } else { if (in4_pton(s, data_limit - s, (u8 *)addr, edelim, &ep) <= 0) return -1; } s = (char *) ep; if (s == data_limit) return -1; if (*s != edelim) return -1; s++; } for (hport = 0; ; s++) { if (s == data_limit) return -1; if (!isdigit(*s)) break; hport = hport * 10 + *s - '0'; } if (s == data_limit || !hport || *s != edelim) return -1; s++; *end = s; *port = htons(hport); return 1; } /* Look at outgoing ftp packets to catch the response to a PASV/EPSV command * from the server (inside-to-outside). * When we see one, we build a connection entry with the client address, * client port 0 (unknown at the moment), the server address and the * server port. Mark the current connection entry as a control channel * of the new entry. All this work is just to make the data connection * can be scheduled to the right server later. * * The outgoing packet should be something like * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. * The extended format for EPSV response provides usually only port: * "229 Entering Extended Passive Mode (|||ppp|)" */ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, struct sk_buff *skb, int *diff, struct ip_vs_iphdr *ipvsh) { char *data, *data_limit; char *start, *end; union nf_inet_addr from; __be16 port; struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ unsigned int buf_len; int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; *diff = 0; /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; /* Linear packets are much easier to deal with. */ if (skb_ensure_writable(skb, skb->len)) return 0; if (cp->app_data == (void *) IP_VS_FTP_PASV) { data = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); if (!data || data >= data_limit) return 1; if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING_PASV, sizeof(SERVER_STRING_PASV)-1, '(', false, IP_VS_FTP_PASV, &from, &port, cp->af, &start, &end) != 1) return 1; IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n", &from.ip, ntohs(port), &cp->caddr.ip, 0); } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { data = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); if (!data || data >= data_limit) return 1; /* Usually, data address is not specified but * we support different address, so pre-set it. */ from = cp->daddr; if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING_EPSV, sizeof(SERVER_STRING_EPSV)-1, '(', true, IP_VS_FTP_EPSV, &from, &port, cp->af, &start, &end) != 1) return 1; IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n", IP_VS_DBG_ADDR(cp->af, &from), ntohs(port), IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0); } else { return 1; } /* Now update or create a connection entry for it */ { struct ip_vs_conn_param p; ip_vs_conn_fill_param(cp->ipvs, cp->af, ipvsh->protocol, &from, port, &cp->caddr, 0, &p); n_cp = ip_vs_conn_out_get(&p); } if (!n_cp) { struct ip_vs_conn_param p; ip_vs_conn_fill_param(cp->ipvs, cp->af, ipvsh->protocol, &cp->caddr, 0, &cp->vaddr, port, &p); n_cp = ip_vs_conn_new(&p, cp->af, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, cp->dest, skb->mark); if (!n_cp) return 0; /* add its controller */ ip_vs_control_add(n_cp, cp); } /* Replace the old passive address with the new one */ if (cp->app_data == (void *) IP_VS_FTP_PASV) { from.ip = n_cp->vaddr.ip; port = n_cp->vport; snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", ((unsigned char *)&from.ip)[0], ((unsigned char *)&from.ip)[1], ((unsigned char *)&from.ip)[2], ((unsigned char *)&from.ip)[3], ntohs(port) >> 8, ntohs(port) & 0xFF); } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { from = n_cp->vaddr; port = n_cp->vport; /* Only port, client will use VIP for the data connection */ snprintf(buf, sizeof(buf), "|||%u|", ntohs(port)); } else { *buf = 0; } buf_len = strlen(buf); ct = nf_ct_get(skb, &ctinfo); if (ct) { bool mangled; /* If mangling fails this function will return 0 * which will cause the packet to be dropped. * Mangling can only fail under memory pressure, * hopefully it will succeed on the retransmitted * packet. */ mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, ipvsh->len, start - data, end - start, buf, buf_len); if (mangled) { ip_vs_nfct_expect_related(skb, ct, n_cp, ipvsh->protocol, 0, 0); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_UNNECESSARY; /* csum is updated */ ret = 1; } } /* Not setting 'diff' is intentional, otherwise the sequence * would be adjusted twice. */ cp->app_data = (void *) IP_VS_FTP_ACTIVE; ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return ret; } /* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command * (outside-to-inside). * * The incoming packet having the PORT command should be something like * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. * In this case, we create a connection entry using the client address and * port, so that the active ftp data connection from the server can reach * the client. * Extended format: * "EPSV\r\n" when client requests server address from same family * "EPSV 1\r\n" when client requests IPv4 server address * "EPSV 2\r\n" when client requests IPv6 server address * "EPSV ALL\r\n" - not supported * EPRT with specified delimiter (ASCII 33..126), "|" by default: * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport */ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, struct sk_buff *skb, int *diff, struct ip_vs_iphdr *ipvsh) { char *data, *data_start, *data_limit; char *start, *end; union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; /* no diff required for incoming packets */ *diff = 0; /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; /* Linear packets are much easier to deal with. */ if (skb_ensure_writable(skb, skb->len)) return 0; data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); if (!data || data >= data_limit) return 1; while (data <= data_limit - 6) { if (cp->af == AF_INET && strncasecmp(data, "PASV\r\n", 6) == 0) { /* Passive mode on */ IP_VS_DBG(7, "got PASV at %td of %td\n", data - data_start, data_limit - data_start); cp->app_data = (void *) IP_VS_FTP_PASV; return 1; } /* EPSV or EPSV<space><net-prt> */ if (strncasecmp(data, "EPSV", 4) == 0 && (data[4] == ' ' || data[4] == '\r')) { if (data[4] == ' ') { char proto = data[5]; if (data > data_limit - 7 || data[6] != '\r') return 1; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && proto == '2') { } else #endif if (cp->af == AF_INET && proto == '1') { } else { return 1; } } /* Extended Passive mode on */ IP_VS_DBG(7, "got EPSV at %td of %td\n", data - data_start, data_limit - data_start); cp->app_data = (void *) IP_VS_FTP_EPSV; return 1; } data++; } /* * To support virtual FTP server, the scenerio is as follows: * FTP client ----> Load Balancer ----> FTP server * First detect the port number in the application data, * then create a new connection entry for the coming data * connection. */ if (cp->af == AF_INET && ip_vs_ftp_get_addrport(data_start, data_limit, CLIENT_STRING_PORT, sizeof(CLIENT_STRING_PORT)-1, ' ', false, IP_VS_FTP_PORT, &to, &port, cp->af, &start, &end) == 1) { IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port)); /* Now update or create a connection entry for it */ IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n", ip_vs_proto_name(ipvsh->protocol), &to.ip, ntohs(port), &cp->vaddr.ip, ntohs(cp->vport)-1); } else if (ip_vs_ftp_get_addrport(data_start, data_limit, CLIENT_STRING_EPRT, sizeof(CLIENT_STRING_EPRT)-1, ' ', true, IP_VS_FTP_EPRT, &to, &port, cp->af, &start, &end) == 1) { IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n", IP_VS_DBG_ADDR(cp->af, &to), ntohs(port)); /* Now update or create a connection entry for it */ IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n", ip_vs_proto_name(ipvsh->protocol), IP_VS_DBG_ADDR(cp->af, &to), ntohs(port), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport)-1); } else { return 1; } /* Passive mode off */ cp->app_data = (void *) IP_VS_FTP_ACTIVE; { struct ip_vs_conn_param p; ip_vs_conn_fill_param(cp->ipvs, cp->af, ipvsh->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr, htons(ntohs(cp->dport)-1), IP_VS_CONN_F_NFCT, cp->dest, skb->mark); if (!n_cp) return 0; /* add its controller */ ip_vs_control_add(n_cp, cp); } } /* * Move tunnel to listen state */ ip_vs_tcp_conn_listen(n_cp); ip_vs_conn_put(n_cp); return 1; } static struct ip_vs_app ip_vs_ftp = { .name = "ftp", .type = IP_VS_APP_TYPE_FTP, .protocol = IPPROTO_TCP, .module = THIS_MODULE, .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), .init_conn = ip_vs_ftp_init_conn, .done_conn = ip_vs_ftp_done_conn, .bind_conn = NULL, .unbind_conn = NULL, .pkt_out = ip_vs_ftp_out, .pkt_in = ip_vs_ftp_in, }; /* * per netns ip_vs_ftp initialization */ static int __net_init __ip_vs_ftp_init(struct net *net) { int i, ret; struct ip_vs_app *app; struct netns_ipvs *ipvs = net_ipvs(net); if (!ipvs) return -ENOENT; app = register_ip_vs_app(ipvs, &ip_vs_ftp); if (IS_ERR(app)) return PTR_ERR(app); for (i = 0; i < ports_count; i++) { if (!ports[i]) continue; ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; } return 0; err_unreg: unregister_ip_vs_app(ipvs, &ip_vs_ftp); return ret; } /* * netns exit */ static void __ip_vs_ftp_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); if (!ipvs) return; unregister_ip_vs_app(ipvs, &ip_vs_ftp); } static struct pernet_operations ip_vs_ftp_ops = { .init = __ip_vs_ftp_init, .exit = __ip_vs_ftp_exit, }; static int __init ip_vs_ftp_init(void) { /* rcu_barrier() is called by netns on error */ return register_pernet_subsys(&ip_vs_ftp_ops); } /* * ip_vs_ftp finish. */ static void __exit ip_vs_ftp_exit(void) { unregister_pernet_subsys(&ip_vs_ftp_ops); /* rcu_barrier() is called by netns */ } module_init(ip_vs_ftp_init); module_exit(ip_vs_ftp_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs ftp helper");
19 19 19 22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 // SPDX-License-Identifier: GPL-2.0 /* * Key setup facility for FS encryption support. * * Copyright (C) 2015, Google, Inc. * * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. * Heavily modified since then. */ #include <crypto/skcipher.h> #include <linux/random.h> #include "fscrypt_private.h" struct fscrypt_mode fscrypt_modes[] = { [FSCRYPT_MODE_AES_256_XTS] = { .friendly_name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .security_strength = 32, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS, }, [FSCRYPT_MODE_AES_256_CTS] = { .friendly_name = "AES-256-CBC-CTS", .cipher_str = "cts(cbc(aes))", .keysize = 32, .security_strength = 32, .ivsize = 16, }, [FSCRYPT_MODE_AES_128_CBC] = { .friendly_name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, }, [FSCRYPT_MODE_AES_128_CTS] = { .friendly_name = "AES-128-CBC-CTS", .cipher_str = "cts(cbc(aes))", .keysize = 16, .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_SM4_XTS] = { .friendly_name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, }, [FSCRYPT_MODE_SM4_CTS] = { .friendly_name = "SM4-CBC-CTS", .cipher_str = "cts(cbc(sm4))", .keysize = 16, .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .security_strength = 32, .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, [FSCRYPT_MODE_AES_256_HCTR2] = { .friendly_name = "AES-256-HCTR2", .cipher_str = "hctr2(aes)", .keysize = 32, .security_strength = 32, .ivsize = 32, }, }; static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) { BUILD_BUG_ON(ARRAY_SIZE(fscrypt_modes) != FSCRYPT_MODE_MAX + 1); if (S_ISREG(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_contents_mode(policy)]; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_fnames_mode(policy)]; WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", inode->i_ino, (inode->i_mode & S_IFMT)); return ERR_PTR(-EINVAL); } /* Create a symmetric cipher object for the given encryption mode and key */ static struct crypto_skcipher * fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, const struct inode *inode) { struct crypto_skcipher *tfm; int err; tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) { fscrypt_warn(inode, "Missing crypto API support for %s (API name: \"%s\")", mode->friendly_name, mode->cipher_str); return ERR_PTR(-ENOPKG); } fscrypt_err(inode, "Error allocating '%s' transform: %ld", mode->cipher_str, PTR_ERR(tfm)); return tfm; } if (!xchg(&mode->logged_cryptoapi_impl, 1)) { /* * fscrypt performance can vary greatly depending on which * crypto algorithm implementation is used. Help people debug * performance problems by logging the ->cra_driver_name the * first time a mode is used. */ pr_info("fscrypt: %s using implementation \"%s\"\n", mode->friendly_name, crypto_skcipher_driver_name(tfm)); } if (WARN_ON_ONCE(crypto_skcipher_ivsize(tfm) != mode->ivsize)) { err = -EINVAL; goto err_free_tfm; } crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); if (err) goto err_free_tfm; return tfm; err_free_tfm: crypto_free_skcipher(tfm); return ERR_PTR(err); } /* * Prepare the crypto transform object or blk-crypto key in @prep_key, given the * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci) { struct crypto_skcipher *tfm; if (fscrypt_using_inline_encryption(ci)) return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci->ci_mode->keysize, false, ci); tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) return PTR_ERR(tfm); /* * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). * I.e., here we publish ->tfm with a RELEASE barrier so that * concurrent tasks can ACQUIRE it. Note that this concurrency is only * possible for per-mode keys, not for per-file keys. */ smp_store_release(&prep_key->tfm, tfm); return 0; } /* Destroy a crypto transform object and/or blk-crypto key. */ void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { crypto_free_skcipher(prep_key->tfm); fscrypt_destroy_inline_crypt_key(sb, prep_key); memzero_explicit(prep_key, sizeof(*prep_key)); } /* Given a per-file encryption key, set up the file's crypto transform object */ int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key) { ci->ci_owns_key = true; return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci); } static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, struct fscrypt_prepared_key *keys, u8 hkdf_context, bool include_fs_uuid) { const struct inode *inode = ci->ci_inode; const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; struct fscrypt_prepared_key *prep_key; u8 mode_key[FSCRYPT_MAX_RAW_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; bool use_hw_wrapped_key = false; int err; if (WARN_ON_ONCE(mode_num > FSCRYPT_MODE_MAX)) return -EINVAL; if (mk->mk_secret.is_hw_wrapped && S_ISREG(inode->i_mode)) { /* Using a hardware-wrapped key for file contents encryption */ if (!fscrypt_using_inline_encryption(ci)) { if (sb->s_flags & SB_INLINECRYPT) fscrypt_warn(ci->ci_inode, "Hardware-wrapped key required, but no suitable inline encryption capabilities are available"); else fscrypt_warn(ci->ci_inode, "Hardware-wrapped keys require inline encryption (-o inlinecrypt)"); return -EINVAL; } use_hw_wrapped_key = true; } prep_key = &keys[mode_num]; if (fscrypt_is_key_prepared(prep_key, ci)) { ci->ci_enc_key = *prep_key; return 0; } mutex_lock(&fscrypt_mode_key_setup_mutex); if (fscrypt_is_key_prepared(prep_key, ci)) goto done_unlock; if (use_hw_wrapped_key) { err = fscrypt_prepare_inline_crypt_key(prep_key, mk->mk_secret.bytes, mk->mk_secret.size, true, ci); if (err) goto out_unlock; goto done_unlock; } BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); BUILD_BUG_ON(sizeof(hkdf_info) != 17); hkdf_info[hkdf_infolen++] = mode_num; if (include_fs_uuid) { memcpy(&hkdf_info[hkdf_infolen], &sb->s_uuid, sizeof(sb->s_uuid)); hkdf_infolen += sizeof(sb->s_uuid); } err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); if (err) goto out_unlock; err = fscrypt_prepare_key(prep_key, mode_key, ci); memzero_explicit(mode_key, mode->keysize); if (err) goto out_unlock; done_unlock: ci->ci_enc_key = *prep_key; err = 0; out_unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); return err; } /* * Derive a SipHash key from the given fscrypt master key and the given * application-specific information string. * * Note that the KDF produces a byte array, but the SipHash APIs expect the key * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an * endianness swap in order to get the same results as on little endian CPUs. */ static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, u8 context, const u8 *info, unsigned int infolen, siphash_key_t *key) { int err; err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, (u8 *)key, sizeof(*key)); if (err) return err; BUILD_BUG_ON(sizeof(*key) != 16); BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); le64_to_cpus(&key->key[0]); le64_to_cpus(&key->key[1]); return 0; } int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { int err; err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, &ci->ci_dirhash_key); if (err) return err; ci->ci_dirhash_key_initialized = true; return 0; } void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { WARN_ON_ONCE(ci->ci_inode->i_ino == 0); WARN_ON_ONCE(!mk->mk_ino_hash_key_initialized); ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, &mk->mk_ino_hash_key); } static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk) { int err; err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys, HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true); if (err) return err; /* pairs with smp_store_release() below */ if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) { mutex_lock(&fscrypt_mode_key_setup_mutex); if (mk->mk_ino_hash_key_initialized) goto unlock; err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, &mk->mk_ino_hash_key); if (err) goto unlock; /* pairs with smp_load_acquire() above */ smp_store_release(&mk->mk_ino_hash_key_initialized, true); unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); if (err) return err; } /* * New inodes may not have an inode number assigned yet. * Hashing their inode number is delayed until later. */ if (ci->ci_inode->i_ino) fscrypt_hash_inode_number(ci, mk); return 0; } static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, bool need_dirhash_key) { int err; if (mk->mk_secret.is_hw_wrapped && !(ci->ci_policy.v2.flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 | FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))) { fscrypt_warn(ci->ci_inode, "Hardware-wrapped keys are only supported with IV_INO_LBLK policies"); return -EINVAL; } if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { /* * DIRECT_KEY: instead of deriving per-file encryption keys, the * per-file nonce will be included in all the IVs. But unlike * v1 policies, for v2 policies in this case we don't encrypt * with the master key directly but rather derive a per-mode * encryption key. This ensures that the master key is * consistently used only for HKDF, avoiding key reuse issues. */ err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys, HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { /* * IV_INO_LBLK_64: encryption keys are derived from (master_key, * mode_num, filesystem_uuid), and inode number is included in * the IVs. This format is optimized for use with inline * encryption hardware compliant with the UFS standard. */ err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys, HKDF_CONTEXT_IV_INO_LBLK_64_KEY, true); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE]; err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, derived_key, ci->ci_mode->keysize); if (err) return err; err = fscrypt_set_per_file_enc_key(ci, derived_key); memzero_explicit(derived_key, ci->ci_mode->keysize); } if (err) return err; /* Derive a secret dirhash key for directories that need it. */ if (need_dirhash_key) { err = fscrypt_derive_dirhash_key(ci, mk); if (err) return err; } return 0; } /* * Check whether the size of the given master key (@mk) is appropriate for the * encryption settings which a particular file will use (@ci). * * If the file uses a v1 encryption policy, then the master key must be at least * as long as the derived key, as this is a requirement of the v1 KDF. * * Otherwise, the KDF can accept any size key, so we enforce a slightly looser * requirement: we require that the size of the master key be at least the * maximum security strength of any algorithm whose key will be derived from it * (but in practice we only need to consider @ci->ci_mode, since any other * possible subkeys such as DIRHASH and INODE_HASH will never increase the * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be * derived from a 256-bit master key, which is cryptographically sufficient, * rather than requiring a 512-bit master key which is unnecessarily long. (We * still allow 512-bit master keys if the user chooses to use them, though.) */ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, const struct fscrypt_inode_info *ci) { unsigned int min_keysize; if (ci->ci_policy.version == FSCRYPT_POLICY_V1) min_keysize = ci->ci_mode->keysize; else min_keysize = ci->ci_mode->security_strength; if (mk->mk_secret.size < min_keysize) { fscrypt_warn(NULL, "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, mk->mk_secret.size, min_keysize); return false; } return true; } /* * Find the master key, then set up the inode's actual encryption key. * * If the master key is found in the filesystem-level keyring, then it is * returned in *mk_ret with its semaphore read-locked. This is needed to ensure * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes * (as multiple tasks may race to create an fscrypt_inode_info for the same * inode), and to synchronize the master key being removed with a new inode * starting to use it. */ static int setup_file_encryption_key(struct fscrypt_inode_info *ci, bool need_dirhash_key, struct fscrypt_master_key **mk_ret) { struct super_block *sb = ci->ci_inode->i_sb; struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; int err; err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec); if (err) return err; mk = fscrypt_find_master_key(sb, &mk_spec); if (unlikely(!mk)) { const union fscrypt_policy *dummy_policy = fscrypt_get_dummy_policy(sb); /* * Add the test_dummy_encryption key on-demand. In principle, * it should be added at mount time. Do it here instead so that * the individual filesystems don't need to worry about adding * this key at mount time and cleaning up on mount failure. */ if (dummy_policy && fscrypt_policies_equal(dummy_policy, &ci->ci_policy)) { err = fscrypt_add_test_dummy_key(sb, &mk_spec); if (err) return err; mk = fscrypt_find_master_key(sb, &mk_spec); } } if (unlikely(!mk)) { if (ci->ci_policy.version != FSCRYPT_POLICY_V1) return -ENOKEY; err = fscrypt_select_encryption_impl(ci, false); if (err) return err; /* * As a legacy fallback for v1 policies, search for the key in * the current task's subscribed keyrings too. Don't move this * to before the search of ->s_master_keys, since users * shouldn't be able to override filesystem-level keys. */ return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci); } down_read(&mk->mk_sem); if (!mk->mk_present) { /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */ err = -ENOKEY; goto out_release_key; } if (!fscrypt_valid_master_key_size(mk, ci)) { err = -ENOKEY; goto out_release_key; } err = fscrypt_select_encryption_impl(ci, mk->mk_secret.is_hw_wrapped); if (err) goto out_release_key; switch (ci->ci_policy.version) { case FSCRYPT_POLICY_V1: if (WARN_ON_ONCE(mk->mk_secret.is_hw_wrapped)) { /* * This should never happen, as adding a v1 policy key * that is hardware-wrapped isn't allowed. */ err = -EINVAL; goto out_release_key; } err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.bytes); break; case FSCRYPT_POLICY_V2: err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key); break; default: WARN_ON_ONCE(1); err = -EINVAL; break; } if (err) goto out_release_key; *mk_ret = mk; return 0; out_release_key: up_read(&mk->mk_sem); fscrypt_put_master_key(mk); return err; } static void put_crypt_info(struct fscrypt_inode_info *ci) { struct fscrypt_master_key *mk; if (!ci) return; if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); else if (ci->ci_owns_key) fscrypt_destroy_prepared_key(ci->ci_inode->i_sb, &ci->ci_enc_key); mk = ci->ci_master_key; if (mk) { /* * Remove this inode from the list of inodes that were unlocked * with the master key. In addition, if we're removing the last * inode from an incompletely removed key, then complete the * full removal of the key. */ spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_inode_info_cachep, ci); } static int fscrypt_setup_encryption_info(struct inode *inode, const union fscrypt_policy *policy, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], bool need_dirhash_key) { struct fscrypt_inode_info *crypt_info; struct fscrypt_mode *mode; struct fscrypt_master_key *mk = NULL; int res; res = fscrypt_initialize(inode->i_sb); if (res) return res; crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; crypt_info->ci_inode = inode; crypt_info->ci_policy = *policy; memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); mode = select_encryption_mode(&crypt_info->ci_policy, inode); if (IS_ERR(mode)) { res = PTR_ERR(mode); goto out; } WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; crypt_info->ci_data_unit_bits = fscrypt_policy_du_bits(&crypt_info->ci_policy, inode); crypt_info->ci_data_units_per_block_bits = inode->i_blkbits - crypt_info->ci_data_unit_bits; res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); if (res) goto out; /* * For existing inodes, multiple tasks may race to set ->i_crypt_info. * So use cmpxchg_release(). This pairs with the smp_load_acquire() in * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with * a RELEASE barrier so that other tasks can ACQUIRE it. */ if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { /* * We won the race and set ->i_crypt_info to our crypt_info. * Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; refcount_inc(&mk->mk_active_refs); spin_lock(&mk->mk_decrypted_inodes_lock); list_add(&crypt_info->ci_master_key_link, &mk->mk_decrypted_inodes); spin_unlock(&mk->mk_decrypted_inodes_lock); } crypt_info = NULL; } res = 0; out: if (mk) { up_read(&mk->mk_sem); fscrypt_put_master_key(mk); } put_crypt_info(crypt_info); return res; } /** * fscrypt_get_encryption_info() - set up an inode's encryption key * @inode: the inode to set up the key for. Must be encrypted. * @allow_unsupported: if %true, treat an unsupported encryption policy (or * unrecognized encryption context) the same way as the key * being unavailable, instead of returning an error. Use * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * * Set up ->i_crypt_info, if it hasn't already been done. * * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * * Return: 0 if ->i_crypt_info was set or was already set, *or* if the * encryption key is unavailable. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) { int res; union fscrypt_context ctx; union fscrypt_policy policy; if (fscrypt_has_encryption_key(inode)) return 0; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (res == -ERANGE && allow_unsupported) return 0; fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } res = fscrypt_policy_from_context(&policy, &ctx, res); if (res) { if (allow_unsupported) return 0; fscrypt_warn(inode, "Unrecognized or corrupt encryption context"); return res; } if (!fscrypt_supported_policy(&policy, inode)) { if (allow_unsupported) return 0; return -EINVAL; } res = fscrypt_setup_encryption_info(inode, &policy, fscrypt_context_nonce(&ctx), IS_CASEFOLDED(inode) && S_ISDIR(inode->i_mode)); if (res == -ENOPKG && allow_unsupported) /* Algorithm unavailable? */ res = 0; if (res == -ENOKEY) res = 0; return res; } /** * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory * @dir: a possibly-encrypted directory * @inode: the new inode. ->i_mode and ->i_blkbits must be set already. * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * * If the directory is encrypted, set up its ->i_crypt_info in preparation for * encrypting the name of the new file. Also, if the new inode will be * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino * isn't required to be set yet, as the filesystem may not have set it yet. * * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * * Return: 0 on success, -ENOKEY if the encryption key is missing, or another * -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { const union fscrypt_policy *policy; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; policy = fscrypt_policy_to_inherit(dir); if (policy == NULL) return 0; if (IS_ERR(policy)) return PTR_ERR(policy); if (WARN_ON_ONCE(inode->i_blkbits == 0)) return -EINVAL; if (WARN_ON_ONCE(inode->i_mode == 0)) return -EINVAL; /* * Only regular files, directories, and symlinks are encrypted. * Special files like device nodes and named pipes aren't. */ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) return 0; *encrypt_ret = true; get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); return fscrypt_setup_encryption_info(inode, policy, nonce, IS_CASEFOLDED(dir) && S_ISDIR(inode->i_mode)); } EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); /** * fscrypt_put_encryption_info() - free most of an inode's fscrypt data * @inode: an inode being evicted * * Free the inode's fscrypt_inode_info. Filesystems must call this when the * inode is being evicted. An RCU grace period need not have elapsed yet. */ void fscrypt_put_encryption_info(struct inode *inode) { put_crypt_info(inode->i_crypt_info); inode->i_crypt_info = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); /** * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay * @inode: an inode being freed * * Free the inode's cached decrypted symlink target, if any. Filesystems must * call this after an RCU grace period, just before they free the inode. */ void fscrypt_free_inode(struct inode *inode) { if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { kfree(inode->i_link); inode->i_link = NULL; } } EXPORT_SYMBOL(fscrypt_free_inode); /** * fscrypt_drop_inode() - check whether the inode's master key has been removed * @inode: an inode being considered for eviction * * Filesystems supporting fscrypt must call this from their ->drop_inode() * method so that encrypted inodes are evicted as soon as they're no longer in * use and their master key has been removed. * * Return: 1 if fscrypt wants the inode to be evicted now, otherwise 0 */ int fscrypt_drop_inode(struct inode *inode) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode); /* * If ci is NULL, then the inode doesn't have an encryption key set up * so it's irrelevant. If ci_master_key is NULL, then the master key * was provided via the legacy mechanism of the process-subscribed * keyrings, so we don't know whether it's been removed or not. */ if (!ci || !ci->ci_master_key) return 0; /* * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes * protected by the key were cleaned by sync_filesystem(). But if * userspace is still using the files, inodes can be dirtied between * then and now. We mustn't lose any writes, so skip dirty inodes here. */ if (inode->i_state & I_DIRTY_ALL) return 0; /* * We can't take ->mk_sem here, since this runs in atomic context. * Therefore, ->mk_present can change concurrently, and our result may * immediately become outdated. But there's no correctness problem with * unnecessarily evicting. Nor is there a correctness problem with not * evicting while iput() is racing with the key being removed, since * then the thread removing the key will either evict the inode itself * or will correctly detect that it wasn't evicted due to the race. */ return !READ_ONCE(ci->ci_master_key->mk_present); } EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _INET_COMMON_H #define _INET_COMMON_H #include <linux/indirect_call_wrapper.h> #include <linux/net.h> #include <linux/netdev_features.h> #include <linux/types.h> #include <net/sock.h> extern const struct proto_ops inet_stream_ops; extern const struct proto_ops inet_dgram_ops; /* * INET4 prototypes used by INET6 */ struct msghdr; struct net; struct page; struct sock; struct sockaddr; struct socket; int inet_release(struct socket *sock); int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); int inet_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); void inet_splice_eof(struct socket *sock); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); int __inet_listen_sk(struct sock *sk, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) /* Called from BPF program. */ #define BIND_FROM_BPF (1 << 2) /* Skip CAP_NET_BIND_SERVICE check. */ #define BIND_NO_CAP_NET_BIND_SERVICE (1 << 3) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net); int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb); int inet_gro_complete(struct sk_buff *skb, int nhoff); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features); static inline void inet_ctl_sock_destroy(struct sock *sk) { if (sk) sock_release(sk->sk_socket); } #define indirect_call_gro_receive(f2, f1, cb, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_2(cb, f2, f1, head, skb); \ }) #endif
11 3 4 1 2 1 1 2 1 13 3 10 12 1 1 7 1 3 2 3 1 1 1 1 11 11 170 170 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_pedit.c Generic packet editor * * Authors: Jamal Hadi Salim (2002-4) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/slab.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_pedit.h> #include <net/tc_act/tc_pedit.h> #include <uapi/linux/tc_act/tc_pedit.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, [TCA_PEDIT_PARMS_EX] = { .len = sizeof(struct tc_pedit) }, [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED }, }; static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = { [TCA_PEDIT_KEY_EX_HTYPE] = NLA_POLICY_MAX(NLA_U16, TCA_PEDIT_HDR_TYPE_MAX), [TCA_PEDIT_KEY_EX_CMD] = NLA_POLICY_MAX(NLA_U16, TCA_PEDIT_CMD_MAX), }; static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, u8 n, struct netlink_ext_ack *extack) { struct tcf_pedit_key_ex *keys_ex; struct tcf_pedit_key_ex *k; const struct nlattr *ka; int err = -EINVAL; int rem; if (!nla) return NULL; keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); if (!keys_ex) return ERR_PTR(-ENOMEM); k = keys_ex; nla_for_each_nested(ka, nla, rem) { struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1]; if (!n) { NL_SET_ERR_MSG_MOD(extack, "Can't parse more extended keys than requested"); err = -EINVAL; goto err_out; } n--; if (nla_type(ka) != TCA_PEDIT_KEY_EX) { NL_SET_ERR_MSG_ATTR(extack, ka, "Unknown attribute, expected extended key"); err = -EINVAL; goto err_out; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_KEY_EX_MAX, ka, pedit_key_ex_policy, NULL); if (err) goto err_out; if (NL_REQ_ATTR_CHECK(extack, nla, tb, TCA_PEDIT_KEY_EX_HTYPE)) { NL_SET_ERR_MSG(extack, "Missing required attribute"); err = -EINVAL; goto err_out; } if (NL_REQ_ATTR_CHECK(extack, nla, tb, TCA_PEDIT_KEY_EX_CMD)) { NL_SET_ERR_MSG(extack, "Missing required attribute"); err = -EINVAL; goto err_out; } k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]); k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]); k++; } if (n) { NL_SET_ERR_MSG_MOD(extack, "Not enough extended keys to parse"); err = -EINVAL; goto err_out; } return keys_ex; err_out: kfree(keys_ex); return ERR_PTR(err); } static int tcf_pedit_key_ex_dump(struct sk_buff *skb, struct tcf_pedit_key_ex *keys_ex, int n) { struct nlattr *keys_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEYS_EX); if (!keys_start) goto nla_failure; for (; n > 0; n--) { struct nlattr *key_start; key_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEY_EX); if (!key_start) goto nla_failure; if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) || nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) goto nla_failure; nla_nest_end(skb, key_start); keys_ex++; } nla_nest_end(skb, keys_start); return 0; nla_failure: nla_nest_cancel(skb, keys_start); return -EINVAL; } static void tcf_pedit_cleanup_rcu(struct rcu_head *head) { struct tcf_pedit_parms *parms = container_of(head, struct tcf_pedit_parms, rcu); kfree(parms->tcfp_keys_ex); kfree(parms->tcfp_keys); kfree(parms); } static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_pedit_parms *oparms, *nparms; struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; struct nlattr *pattr; struct tcf_pedit *p; int ret = 0, err; int i, ksize; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) return err; pattr = tb[TCA_PEDIT_PARMS]; if (!pattr) pattr = tb[TCA_PEDIT_PARMS_EX]; if (!pattr) { NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute"); return -EINVAL; } parm = nla_data(pattr); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_pedit_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; } } else { return err; } if (!parm->nkeys) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); ret = -EINVAL; goto out_release; } ksize = parm->nkeys * sizeof(struct tc_pedit_key); if (nla_len(pattr) < sizeof(*parm) + ksize) { NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); ret = -EINVAL; goto out_release; } nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); if (!nparms) { ret = -ENOMEM; goto out_release; } nparms->tcfp_keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys, extack); if (IS_ERR(nparms->tcfp_keys_ex)) { ret = PTR_ERR(nparms->tcfp_keys_ex); goto out_free; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) { ret = err; goto out_free_ex; } nparms->tcfp_off_max_hint = 0; nparms->tcfp_flags = parm->flags; nparms->tcfp_nkeys = parm->nkeys; nparms->tcfp_keys = kmemdup(parm->keys, ksize, GFP_KERNEL); if (!nparms->tcfp_keys) { ret = -ENOMEM; goto put_chain; } for (i = 0; i < nparms->tcfp_nkeys; ++i) { u32 offmask = nparms->tcfp_keys[i].offmask; u32 cur = nparms->tcfp_keys[i].off; /* The AT option can be added to static offsets in the datapath */ if (!offmask && cur % 4) { NL_SET_ERR_MSG_MOD(extack, "Offsets must be on 32bit boundaries"); ret = -EINVAL; goto out_free_keys; } /* sanitize the shift value for any later use */ nparms->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, nparms->tcfp_keys[i].shift); /* The AT option can read a single byte, we can bound the actual * value with uchar max. */ cur += (0xff & offmask) >> nparms->tcfp_keys[i].shift; /* Each key touches 4 bytes starting from the computed offset */ nparms->tcfp_off_max_hint = max(nparms->tcfp_off_max_hint, cur + 4); } p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); spin_unlock_bh(&p->tcf_lock); if (oparms) call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; out_free_keys: kfree(nparms->tcfp_keys); put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); out_free_ex: kfree(nparms->tcfp_keys_ex); out_free: kfree(nparms); out_release: tcf_idr_release(*a, bind); return ret; } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; parms = rcu_dereference_protected(p->parms, 1); if (parms) call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu); } static bool offset_valid(struct sk_buff *skb, int offset) { if (offset > 0 && offset > skb->len) return false; if (offset < 0 && -offset > skb_headroom(skb)) return false; return true; } static int pedit_l4_skb_offset(struct sk_buff *skb, int *hoffset, const int header_type) { const int noff = skb_network_offset(skb); int ret = -EINVAL; struct iphdr _iph; switch (skb->protocol) { case htons(ETH_P_IP): { const struct iphdr *iph = skb_header_pointer(skb, noff, sizeof(_iph), &_iph); if (!iph) goto out; *hoffset = noff + iph->ihl * 4; ret = 0; break; } case htons(ETH_P_IPV6): ret = ipv6_find_hdr(skb, hoffset, header_type, NULL, NULL) == header_type ? 0 : -EINVAL; break; } out: return ret; } static int pedit_skb_hdr_offset(struct sk_buff *skb, enum pedit_header_type htype, int *hoffset) { int ret = -EINVAL; /* 'htype' is validated in the netlink parsing */ switch (htype) { case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: if (skb_mac_header_was_set(skb)) { *hoffset = skb_mac_offset(skb); ret = 0; } break; case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: *hoffset = skb_network_offset(skb); ret = 0; break; case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_TCP); break; case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_UDP); break; default: break; } return ret; } TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_key_ex *tkey_ex; struct tcf_pedit_parms *parms; struct tc_pedit_key *tkey; u32 max_offset; int i; parms = rcu_dereference_bh(p->parms); max_offset = (skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : skb_network_offset(skb)) + parms->tcfp_off_max_hint; if (skb_ensure_writable(skb, min(skb->len, max_offset))) goto done; tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); tkey = parms->tcfp_keys; tkey_ex = parms->tcfp_keys_ex; for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) { int offset = tkey->off; int hoffset = 0; u32 *ptr, hdata; u32 val; int rc; if (tkey_ex) { htype = tkey_ex->htype; cmd = tkey_ex->cmd; tkey_ex++; } rc = pedit_skb_hdr_offset(skb, htype, &hoffset); if (rc) { pr_info_ratelimited("tc action pedit unable to extract header offset for header type (0x%x)\n", htype); goto bad; } if (tkey->offmask) { u8 *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { pr_info_ratelimited("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } d = skb_header_pointer(skb, hoffset + tkey->at, sizeof(_d), &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; if (offset % 4) { pr_info_ratelimited("tc action pedit offset must be on 32 bit boundaries\n"); goto bad; } } if (!offset_valid(skb, hoffset + offset)) { pr_info_ratelimited("tc action pedit offset %d out of bounds\n", hoffset + offset); goto bad; } ptr = skb_header_pointer(skb, hoffset + offset, sizeof(hdata), &hdata); if (!ptr) goto bad; /* just do it, baby */ switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: val = tkey->val; break; case TCA_PEDIT_KEY_EX_CMD_ADD: val = (*ptr + tkey->val) & ~tkey->mask; break; default: pr_info_ratelimited("tc action pedit bad command (%d)\n", cmd); goto bad; } *ptr = ((*ptr & tkey->mask) ^ val); if (ptr == &hdata) skb_store_bits(skb, hoffset + offset, ptr, 4); } goto done; bad: tcf_action_inc_overlimit_qstats(&p->common); done: return p->tcf_action; } static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_pedit *d = to_pedit(a); struct tcf_t *tm = &d->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; spin_lock_bh(&p->tcf_lock); parms = rcu_dereference_protected(p->parms, 1); s = struct_size(opt, keys, parms->tcfp_nkeys); opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt)) { spin_unlock_bh(&p->tcf_lock); return -ENOBUFS; } opt->nkeys = parms->tcfp_nkeys; memcpy(opt->keys, parms->tcfp_keys, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; opt->flags = parms->tcfp_flags; opt->action = p->tcf_action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (parms->tcfp_keys_ex) { if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex, parms->tcfp_nkeys)) goto nla_put_failure; if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) goto nla_put_failure; } else { if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) goto nla_put_failure; } tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; spin_unlock_bh(&p->tcf_lock); kfree(opt); return skb->len; nla_put_failure: spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); kfree(opt); return -1; } static int tcf_pedit_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; int k; for (k = 0; k < tcf_pedit_nkeys(act); k++) { switch (tcf_pedit_cmd(act, k)) { case TCA_PEDIT_KEY_EX_CMD_SET: entry->id = FLOW_ACTION_MANGLE; break; case TCA_PEDIT_KEY_EX_CMD_ADD: entry->id = FLOW_ACTION_ADD; break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } entry->mangle.htype = tcf_pedit_htype(act, k); entry->mangle.mask = tcf_pedit_mask(act, k); entry->mangle.val = tcf_pedit_val(act, k); entry->mangle.offset = tcf_pedit_offset(act, k); entry->hw_stats = tc_act_hw_stats(act->hw_stats); entry++; } *index_inc = k; } else { struct flow_offload_action *fl_action = entry_data; u32 cmd = tcf_pedit_cmd(act, 0); int k; switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: fl_action->id = FLOW_ACTION_MANGLE; break; case TCA_PEDIT_KEY_EX_CMD_ADD: fl_action->id = FLOW_ACTION_ADD; break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } for (k = 1; k < tcf_pedit_nkeys(act); k++) { if (cmd != tcf_pedit_cmd(act, k)) { NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } } } return 0; } static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .id = TCA_ID_PEDIT, .owner = THIS_MODULE, .act = tcf_pedit_act, .stats_update = tcf_pedit_stats_update, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, .offload_act_setup = tcf_pedit_offload_act_setup, .size = sizeof(struct tcf_pedit), }; MODULE_ALIAS_NET_ACT("pedit"); static __net_init int pedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); return tc_action_net_init(net, tn, &act_pedit_ops); } static void __net_exit pedit_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_pedit_ops.net_id); } static struct pernet_operations pedit_net_ops = { .init = pedit_init_net, .exit_batch = pedit_exit_net, .id = &act_pedit_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Packet Editor actions"); MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { return tcf_register_action(&act_pedit_ops, &pedit_net_ops); } static void __exit pedit_cleanup_module(void) { tcf_unregister_action(&act_pedit_ops, &pedit_net_ops); } module_init(pedit_init_module); module_exit(pedit_cleanup_module);
36 11 4 4 2 2 24 1 12 11 24 3 14 13 27 29 29 2 27 27 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 // SPDX-License-Identifier: GPL-2.0-or-later /* * vimc-debayer.c Virtual Media Controller Driver * * Copyright (C) 2015-2017 Helen Koike <helen.fornazier@gmail.com> */ #include <linux/moduleparam.h> #include <linux/platform_device.h> #include <linux/vmalloc.h> #include <linux/v4l2-mediabus.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-event.h> #include <media/v4l2-subdev.h> #include "vimc-common.h" /* TODO: Add support for more output formats, we only support RGB888 for now. */ #define VIMC_DEBAYER_SOURCE_MBUS_FMT MEDIA_BUS_FMT_RGB888_1X24 enum vimc_debayer_rgb_colors { VIMC_DEBAYER_RED = 0, VIMC_DEBAYER_GREEN = 1, VIMC_DEBAYER_BLUE = 2, }; struct vimc_debayer_pix_map { u32 code; enum vimc_debayer_rgb_colors order[2][2]; }; struct vimc_debayer_device { struct vimc_ent_device ved; struct v4l2_subdev sd; struct v4l2_ctrl_handler hdl; struct media_pad pads[2]; u8 *src_frame; void (*set_rgb_src)(struct vimc_debayer_device *vdebayer, unsigned int lin, unsigned int col, unsigned int rgb[3]); /* * Virtual "hardware" configuration, filled when the stream starts or * when controls are set. */ struct { const struct vimc_debayer_pix_map *sink_pix_map; unsigned int sink_bpp; struct v4l2_area size; unsigned int mean_win_size; u32 src_code; } hw; }; static const struct v4l2_mbus_framefmt sink_fmt_default = { .width = 640, .height = 480, .code = MEDIA_BUS_FMT_SRGGB8_1X8, .field = V4L2_FIELD_NONE, .colorspace = V4L2_COLORSPACE_SRGB, }; static const u32 vimc_debayer_src_mbus_codes[] = { MEDIA_BUS_FMT_GBR888_1X24, MEDIA_BUS_FMT_BGR888_1X24, MEDIA_BUS_FMT_BGR888_3X8, MEDIA_BUS_FMT_RGB888_1X24, MEDIA_BUS_FMT_RGB888_2X12_BE, MEDIA_BUS_FMT_RGB888_2X12_LE, MEDIA_BUS_FMT_RGB888_3X8, MEDIA_BUS_FMT_RGB888_1X7X4_SPWG, MEDIA_BUS_FMT_RGB888_1X7X4_JEIDA, MEDIA_BUS_FMT_RGB888_1X32_PADHI, }; static const struct vimc_debayer_pix_map vimc_debayer_pix_map_list[] = { { .code = MEDIA_BUS_FMT_SBGGR8_1X8, .order = { { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED } } }, { .code = MEDIA_BUS_FMT_SGBRG8_1X8, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE }, { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SGRBG8_1X8, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED }, { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SRGGB8_1X8, .order = { { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE } } }, { .code = MEDIA_BUS_FMT_SBGGR10_1X10, .order = { { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED } } }, { .code = MEDIA_BUS_FMT_SGBRG10_1X10, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE }, { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SGRBG10_1X10, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED }, { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SRGGB10_1X10, .order = { { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE } } }, { .code = MEDIA_BUS_FMT_SBGGR12_1X12, .order = { { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED } } }, { .code = MEDIA_BUS_FMT_SGBRG12_1X12, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE }, { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SGRBG12_1X12, .order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED }, { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN } } }, { .code = MEDIA_BUS_FMT_SRGGB12_1X12, .order = { { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN }, { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE } } }, }; static const struct vimc_debayer_pix_map *vimc_debayer_pix_map_by_code(u32 code) { unsigned int i; for (i = 0; i < ARRAY_SIZE(vimc_debayer_pix_map_list); i++) if (vimc_debayer_pix_map_list[i].code == code) return &vimc_debayer_pix_map_list[i]; return NULL; } static bool vimc_debayer_src_code_is_valid(u32 code) { unsigned int i; for (i = 0; i < ARRAY_SIZE(vimc_debayer_src_mbus_codes); i++) if (vimc_debayer_src_mbus_codes[i] == code) return true; return false; } static int vimc_debayer_init_state(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state) { struct v4l2_mbus_framefmt *mf; mf = v4l2_subdev_state_get_format(sd_state, 0); *mf = sink_fmt_default; mf = v4l2_subdev_state_get_format(sd_state, 1); *mf = sink_fmt_default; mf->code = VIMC_DEBAYER_SOURCE_MBUS_FMT; return 0; } static int vimc_debayer_enum_mbus_code(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state, struct v4l2_subdev_mbus_code_enum *code) { if (VIMC_IS_SRC(code->pad)) { if (code->index >= ARRAY_SIZE(vimc_debayer_src_mbus_codes)) return -EINVAL; code->code = vimc_debayer_src_mbus_codes[code->index]; } else { if (code->index >= ARRAY_SIZE(vimc_debayer_pix_map_list)) return -EINVAL; code->code = vimc_debayer_pix_map_list[code->index].code; } return 0; } static int vimc_debayer_enum_frame_size(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state, struct v4l2_subdev_frame_size_enum *fse) { if (fse->index) return -EINVAL; if (VIMC_IS_SINK(fse->pad)) { const struct vimc_debayer_pix_map *vpix = vimc_debayer_pix_map_by_code(fse->code); if (!vpix) return -EINVAL; } else if (!vimc_debayer_src_code_is_valid(fse->code)) { return -EINVAL; } fse->min_width = VIMC_FRAME_MIN_WIDTH; fse->max_width = VIMC_FRAME_MAX_WIDTH; fse->min_height = VIMC_FRAME_MIN_HEIGHT; fse->max_height = VIMC_FRAME_MAX_HEIGHT; return 0; } static void vimc_debayer_adjust_sink_fmt(struct v4l2_mbus_framefmt *fmt) { const struct vimc_debayer_pix_map *vpix; /* Don't accept a code that is not on the debayer table */ vpix = vimc_debayer_pix_map_by_code(fmt->code); if (!vpix) fmt->code = sink_fmt_default.code; fmt->width = clamp_t(u32, fmt->width, VIMC_FRAME_MIN_WIDTH, VIMC_FRAME_MAX_WIDTH) & ~1; fmt->height = clamp_t(u32, fmt->height, VIMC_FRAME_MIN_HEIGHT, VIMC_FRAME_MAX_HEIGHT) & ~1; if (fmt->field == V4L2_FIELD_ANY) fmt->field = sink_fmt_default.field; vimc_colorimetry_clamp(fmt); } static int vimc_debayer_set_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *sd_state, struct v4l2_subdev_format *fmt) { struct vimc_debayer_device *vdebayer = v4l2_get_subdevdata(sd); struct v4l2_mbus_framefmt *format; /* Do not change the format while stream is on. */ if (fmt->which == V4L2_SUBDEV_FORMAT_ACTIVE && vdebayer->src_frame) return -EBUSY; /* * Do not change the format of the source pad, it is propagated from * the sink. */ if (VIMC_IS_SRC(fmt->pad)) return v4l2_subdev_get_fmt(sd, sd_state, fmt); /* Set the new format in the sink pad. */ vimc_debayer_adjust_sink_fmt(&fmt->format); format = v4l2_subdev_state_get_format(sd_state, 0); dev_dbg(vdebayer->ved.dev, "%s: sink format update: " "old:%dx%d (0x%x, %d, %d, %d, %d) " "new:%dx%d (0x%x, %d, %d, %d, %d)\n", vdebayer->sd.name, /* old */ format->width, format->height, format->code, format->colorspace, format->quantization, format->xfer_func, format->ycbcr_enc, /* new */ fmt->format.width, fmt->format.height, fmt->format.code, fmt->format.colorspace, fmt->format.quantization, fmt->format.xfer_func, fmt->format.ycbcr_enc); *format = fmt->format; /* Propagate the format to the source pad. */ format = v4l2_subdev_state_get_format(sd_state, 1); *format = fmt->format; format->code = VIMC_DEBAYER_SOURCE_MBUS_FMT; return 0; } static const struct v4l2_subdev_pad_ops vimc_debayer_pad_ops = { .enum_mbus_code = vimc_debayer_enum_mbus_code, .enum_frame_size = vimc_debayer_enum_frame_size, .get_fmt = v4l2_subdev_get_fmt, .set_fmt = vimc_debayer_set_fmt, }; static void vimc_debayer_process_rgb_frame(struct vimc_debayer_device *vdebayer, unsigned int lin, unsigned int col, unsigned int rgb[3]) { const struct vimc_pix_map *vpix; unsigned int i, index; vpix = vimc_pix_map_by_code(vdebayer->hw.src_code); index = VIMC_FRAME_INDEX(lin, col, vdebayer->hw.size.width, 3); for (i = 0; i < 3; i++) { switch (vpix->pixelformat) { case V4L2_PIX_FMT_RGB24: vdebayer->src_frame[index + i] = rgb[i]; break; case V4L2_PIX_FMT_BGR24: vdebayer->src_frame[index + i] = rgb[2 - i]; break; } } } static int vimc_debayer_s_stream(struct v4l2_subdev *sd, int enable) { struct vimc_debayer_device *vdebayer = v4l2_get_subdevdata(sd); if (enable) { const struct v4l2_mbus_framefmt *sink_fmt; const struct v4l2_mbus_framefmt *src_fmt; struct v4l2_subdev_state *state; const struct vimc_pix_map *vpix; unsigned int frame_size; if (vdebayer->src_frame) return 0; state = v4l2_subdev_lock_and_get_active_state(sd); sink_fmt = v4l2_subdev_state_get_format(state, 0); src_fmt = v4l2_subdev_state_get_format(state, 1); /* Calculate the frame size of the source pad */ vpix = vimc_pix_map_by_code(src_fmt->code); frame_size = src_fmt->width * src_fmt->height * vpix->bpp; /* Save the bytes per pixel of the sink */ vpix = vimc_pix_map_by_code(sink_fmt->code); vdebayer->hw.sink_bpp = vpix->bpp; /* Get the corresponding pixel map from the table */ vdebayer->hw.sink_pix_map = vimc_debayer_pix_map_by_code(sink_fmt->code); vdebayer->hw.size.width = sink_fmt->width; vdebayer->hw.size.height = sink_fmt->height; vdebayer->hw.src_code = src_fmt->code; v4l2_subdev_unlock_state(state); /* * Allocate the frame buffer. Use vmalloc to be able to * allocate a large amount of memory */ vdebayer->src_frame = vmalloc(frame_size); if (!vdebayer->src_frame) return -ENOMEM; } else { if (!vdebayer->src_frame) return 0; vfree(vdebayer->src_frame); vdebayer->src_frame = NULL; } return 0; } static const struct v4l2_subdev_core_ops vimc_debayer_core_ops = { .log_status = v4l2_ctrl_subdev_log_status, .subscribe_event = v4l2_ctrl_subdev_subscribe_event, .unsubscribe_event = v4l2_event_subdev_unsubscribe, }; static const struct v4l2_subdev_video_ops vimc_debayer_video_ops = { .s_stream = vimc_debayer_s_stream, }; static const struct v4l2_subdev_ops vimc_debayer_ops = { .core = &vimc_debayer_core_ops, .pad = &vimc_debayer_pad_ops, .video = &vimc_debayer_video_ops, }; static const struct v4l2_subdev_internal_ops vimc_debayer_internal_ops = { .init_state = vimc_debayer_init_state, }; static unsigned int vimc_debayer_get_val(const u8 *bytes, const unsigned int n_bytes) { unsigned int i; unsigned int acc = 0; for (i = 0; i < n_bytes; i++) acc = acc + (bytes[i] << (8 * i)); return acc; } static void vimc_debayer_calc_rgb_sink(struct vimc_debayer_device *vdebayer, const u8 *frame, const unsigned int lin, const unsigned int col, unsigned int rgb[3]) { unsigned int i, seek, wlin, wcol; unsigned int n_rgb[3] = {0, 0, 0}; for (i = 0; i < 3; i++) rgb[i] = 0; /* * Calculate how many we need to subtract to get to the pixel in * the top left corner of the mean window (considering the current * pixel as the center) */ seek = vdebayer->hw.mean_win_size / 2; /* Sum the values of the colors in the mean window */ dev_dbg(vdebayer->ved.dev, "deb: %s: --- Calc pixel %dx%d, window mean %d, seek %d ---\n", vdebayer->sd.name, lin, col, vdebayer->hw.size.height, seek); /* * Iterate through all the lines in the mean window, start * with zero if the pixel is outside the frame and don't pass * the height when the pixel is in the bottom border of the * frame */ for (wlin = seek > lin ? 0 : lin - seek; wlin < lin + seek + 1 && wlin < vdebayer->hw.size.height; wlin++) { /* * Iterate through all the columns in the mean window, start * with zero if the pixel is outside the frame and don't pass * the width when the pixel is in the right border of the * frame */ for (wcol = seek > col ? 0 : col - seek; wcol < col + seek + 1 && wcol < vdebayer->hw.size.width; wcol++) { enum vimc_debayer_rgb_colors color; unsigned int index; /* Check which color this pixel is */ color = vdebayer->hw.sink_pix_map->order[wlin % 2][wcol % 2]; index = VIMC_FRAME_INDEX(wlin, wcol, vdebayer->hw.size.width, vdebayer->hw.sink_bpp); dev_dbg(vdebayer->ved.dev, "deb: %s: RGB CALC: frame index %d, win pos %dx%d, color %d\n", vdebayer->sd.name, index, wlin, wcol, color); /* Get its value */ rgb[color] = rgb[color] + vimc_debayer_get_val(&frame[index], vdebayer->hw.sink_bpp); /* Save how many values we already added */ n_rgb[color]++; dev_dbg(vdebayer->ved.dev, "deb: %s: RGB CALC: val %d, n %d\n", vdebayer->sd.name, rgb[color], n_rgb[color]); } } /* Calculate the mean */ for (i = 0; i < 3; i++) { dev_dbg(vdebayer->ved.dev, "deb: %s: PRE CALC: %dx%d Color %d, val %d, n %d\n", vdebayer->sd.name, lin, col, i, rgb[i], n_rgb[i]); if (n_rgb[i]) rgb[i] = rgb[i] / n_rgb[i]; dev_dbg(vdebayer->ved.dev, "deb: %s: FINAL CALC: %dx%d Color %d, val %d\n", vdebayer->sd.name, lin, col, i, rgb[i]); } } static void *vimc_debayer_process_frame(struct vimc_ent_device *ved, const void *sink_frame) { struct vimc_debayer_device *vdebayer = container_of(ved, struct vimc_debayer_device, ved); unsigned int rgb[3]; unsigned int i, j; /* If the stream in this node is not active, just return */ if (!vdebayer->src_frame) return ERR_PTR(-EINVAL); for (i = 0; i < vdebayer->hw.size.height; i++) for (j = 0; j < vdebayer->hw.size.width; j++) { vimc_debayer_calc_rgb_sink(vdebayer, sink_frame, i, j, rgb); vdebayer->set_rgb_src(vdebayer, i, j, rgb); } return vdebayer->src_frame; } static int vimc_debayer_s_ctrl(struct v4l2_ctrl *ctrl) { struct vimc_debayer_device *vdebayer = container_of(ctrl->handler, struct vimc_debayer_device, hdl); switch (ctrl->id) { case VIMC_CID_MEAN_WIN_SIZE: vdebayer->hw.mean_win_size = ctrl->val; break; default: return -EINVAL; } return 0; } static const struct v4l2_ctrl_ops vimc_debayer_ctrl_ops = { .s_ctrl = vimc_debayer_s_ctrl, }; static void vimc_debayer_release(struct vimc_ent_device *ved) { struct vimc_debayer_device *vdebayer = container_of(ved, struct vimc_debayer_device, ved); v4l2_ctrl_handler_free(&vdebayer->hdl); v4l2_subdev_cleanup(&vdebayer->sd); media_entity_cleanup(vdebayer->ved.ent); kfree(vdebayer); } static const struct v4l2_ctrl_config vimc_debayer_ctrl_class = { .flags = V4L2_CTRL_FLAG_READ_ONLY | V4L2_CTRL_FLAG_WRITE_ONLY, .id = VIMC_CID_VIMC_CLASS, .name = "VIMC Controls", .type = V4L2_CTRL_TYPE_CTRL_CLASS, }; static const struct v4l2_ctrl_config vimc_debayer_ctrl_mean_win_size = { .ops = &vimc_debayer_ctrl_ops, .id = VIMC_CID_MEAN_WIN_SIZE, .name = "Debayer Mean Window Size", .type = V4L2_CTRL_TYPE_INTEGER, .min = 1, .max = 25, .step = 2, .def = 3, }; static struct vimc_ent_device *vimc_debayer_add(struct vimc_device *vimc, const char *vcfg_name) { struct v4l2_device *v4l2_dev = &vimc->v4l2_dev; struct vimc_debayer_device *vdebayer; int ret; /* Allocate the vdebayer struct */ vdebayer = kzalloc(sizeof(*vdebayer), GFP_KERNEL); if (!vdebayer) return ERR_PTR(-ENOMEM); /* Create controls: */ v4l2_ctrl_handler_init(&vdebayer->hdl, 2); v4l2_ctrl_new_custom(&vdebayer->hdl, &vimc_debayer_ctrl_class, NULL); v4l2_ctrl_new_custom(&vdebayer->hdl, &vimc_debayer_ctrl_mean_win_size, NULL); vdebayer->sd.ctrl_handler = &vdebayer->hdl; if (vdebayer->hdl.error) { ret = vdebayer->hdl.error; goto err_free_vdebayer; } /* Initialize ved and sd */ vdebayer->pads[0].flags = MEDIA_PAD_FL_SINK; vdebayer->pads[1].flags = MEDIA_PAD_FL_SOURCE; ret = vimc_ent_sd_register(&vdebayer->ved, &vdebayer->sd, v4l2_dev, vcfg_name, MEDIA_ENT_F_PROC_VIDEO_PIXEL_ENC_CONV, 2, vdebayer->pads, &vimc_debayer_internal_ops, &vimc_debayer_ops); if (ret) goto err_free_hdl; vdebayer->ved.process_frame = vimc_debayer_process_frame; vdebayer->ved.dev = vimc->mdev.dev; vdebayer->hw.mean_win_size = vimc_debayer_ctrl_mean_win_size.def; vdebayer->set_rgb_src = vimc_debayer_process_rgb_frame; return &vdebayer->ved; err_free_hdl: v4l2_ctrl_handler_free(&vdebayer->hdl); err_free_vdebayer: kfree(vdebayer); return ERR_PTR(ret); } const struct vimc_ent_type vimc_debayer_type = { .add = vimc_debayer_add, .release = vimc_debayer_release };
2 1 2 2 2 5 5 1 3 28 28 3 1 57 57 29 32 28 4 28 4 28 55 24 29 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 // SPDX-License-Identifier: GPL-2.0 /* dvb-usb-remote.c is part of the DVB USB library. * * Copyright (C) 2004-6 Patrick Boettcher (patrick.boettcher@posteo.de) * see dvb-usb-init.c for copyright information. * * This file contains functions for initializing the input-device and for handling remote-control-queries. */ #include "dvb-usb-common.h" #include <linux/usb/input.h> static unsigned int legacy_dvb_usb_get_keymap_index(const struct input_keymap_entry *ke, struct rc_map_table *keymap, unsigned int keymap_size) { unsigned int index; unsigned int scancode; if (ke->flags & INPUT_KEYMAP_BY_INDEX) { index = ke->index; } else { if (input_scancode_to_scalar(ke, &scancode)) return keymap_size; /* See if we can match the raw key code. */ for (index = 0; index < keymap_size; index++) if (keymap[index].scancode == scancode) break; /* See if there is an unused hole in the map */ if (index >= keymap_size) { for (index = 0; index < keymap_size; index++) { if (keymap[index].keycode == KEY_RESERVED || keymap[index].keycode == KEY_UNKNOWN) { break; } } } } return index; } static int legacy_dvb_usb_getkeycode(struct input_dev *dev, struct input_keymap_entry *ke) { struct dvb_usb_device *d = input_get_drvdata(dev); struct rc_map_table *keymap = d->props.rc.legacy.rc_map_table; unsigned int keymap_size = d->props.rc.legacy.rc_map_size; unsigned int index; index = legacy_dvb_usb_get_keymap_index(ke, keymap, keymap_size); if (index >= keymap_size) return -EINVAL; ke->keycode = keymap[index].keycode; if (ke->keycode == KEY_UNKNOWN) ke->keycode = KEY_RESERVED; ke->len = sizeof(keymap[index].scancode); memcpy(&ke->scancode, &keymap[index].scancode, ke->len); ke->index = index; return 0; } static int legacy_dvb_usb_setkeycode(struct input_dev *dev, const struct input_keymap_entry *ke, unsigned int *old_keycode) { struct dvb_usb_device *d = input_get_drvdata(dev); struct rc_map_table *keymap = d->props.rc.legacy.rc_map_table; unsigned int keymap_size = d->props.rc.legacy.rc_map_size; unsigned int index; index = legacy_dvb_usb_get_keymap_index(ke, keymap, keymap_size); /* * FIXME: Currently, it is not possible to increase the size of * scancode table. For it to happen, one possibility * would be to allocate a table with key_map_size + 1, * copying data, appending the new key on it, and freeing * the old one - or maybe just allocating some spare space */ if (index >= keymap_size) return -EINVAL; *old_keycode = keymap[index].keycode; keymap->keycode = ke->keycode; __set_bit(ke->keycode, dev->keybit); if (*old_keycode != KEY_RESERVED) { __clear_bit(*old_keycode, dev->keybit); for (index = 0; index < keymap_size; index++) { if (keymap[index].keycode == *old_keycode) { __set_bit(*old_keycode, dev->keybit); break; } } } return 0; } /* Remote-control poll function - called every dib->rc_query_interval ms to see * whether the remote control has received anything. * * TODO: Fix the repeat rate of the input device. */ static void legacy_dvb_usb_read_remote_control(struct work_struct *work) { struct dvb_usb_device *d = container_of(work, struct dvb_usb_device, rc_query_work.work); u32 event; int state; /* TODO: need a lock here. We can simply skip checking for the remote control if we're busy. */ /* when the parameter has been set to 1 via sysfs while the driver was running */ if (dvb_usb_disable_rc_polling) return; if (d->props.rc.legacy.rc_query(d,&event,&state)) { err("error while querying for an remote control event."); goto schedule; } switch (state) { case REMOTE_NO_KEY_PRESSED: break; case REMOTE_KEY_PRESSED: deb_rc("key pressed\n"); d->last_event = event; input_event(d->input_dev, EV_KEY, event, 1); input_sync(d->input_dev); input_event(d->input_dev, EV_KEY, d->last_event, 0); input_sync(d->input_dev); break; case REMOTE_KEY_REPEAT: deb_rc("key repeated\n"); input_event(d->input_dev, EV_KEY, event, 1); input_sync(d->input_dev); input_event(d->input_dev, EV_KEY, d->last_event, 0); input_sync(d->input_dev); break; default: break; } /* improved repeat handling ??? switch (state) { case REMOTE_NO_KEY_PRESSED: deb_rc("NO KEY PRESSED\n"); if (d->last_state != REMOTE_NO_KEY_PRESSED) { deb_rc("releasing event %d\n",d->last_event); input_event(d->rc_input_dev, EV_KEY, d->last_event, 0); input_sync(d->rc_input_dev); } d->last_state = REMOTE_NO_KEY_PRESSED; d->last_event = 0; break; case REMOTE_KEY_PRESSED: deb_rc("KEY PRESSED\n"); deb_rc("pressing event %d\n",event); input_event(d->rc_input_dev, EV_KEY, event, 1); input_sync(d->rc_input_dev); d->last_event = event; d->last_state = REMOTE_KEY_PRESSED; break; case REMOTE_KEY_REPEAT: deb_rc("KEY_REPEAT\n"); if (d->last_state != REMOTE_NO_KEY_PRESSED) { deb_rc("repeating event %d\n",d->last_event); input_event(d->rc_input_dev, EV_KEY, d->last_event, 2); input_sync(d->rc_input_dev); d->last_state = REMOTE_KEY_REPEAT; } default: break; } */ schedule: schedule_delayed_work(&d->rc_query_work,msecs_to_jiffies(d->props.rc.legacy.rc_interval)); } static int legacy_dvb_usb_remote_init(struct dvb_usb_device *d) { int i, err, rc_interval; struct input_dev *input_dev; input_dev = input_allocate_device(); if (!input_dev) return -ENOMEM; input_dev->evbit[0] = BIT_MASK(EV_KEY); input_dev->name = "IR-receiver inside an USB DVB receiver"; input_dev->phys = d->rc_phys; usb_to_input_id(d->udev, &input_dev->id); input_dev->dev.parent = &d->udev->dev; d->input_dev = input_dev; d->rc_dev = NULL; input_dev->getkeycode = legacy_dvb_usb_getkeycode; input_dev->setkeycode = legacy_dvb_usb_setkeycode; /* set the bits for the keys */ deb_rc("key map size: %d\n", d->props.rc.legacy.rc_map_size); for (i = 0; i < d->props.rc.legacy.rc_map_size; i++) { deb_rc("setting bit for event %d item %d\n", d->props.rc.legacy.rc_map_table[i].keycode, i); set_bit(d->props.rc.legacy.rc_map_table[i].keycode, input_dev->keybit); } /* setting these two values to non-zero, we have to manage key repeats */ input_dev->rep[REP_PERIOD] = d->props.rc.legacy.rc_interval; input_dev->rep[REP_DELAY] = d->props.rc.legacy.rc_interval + 150; input_set_drvdata(input_dev, d); err = input_register_device(input_dev); if (err) input_free_device(input_dev); rc_interval = d->props.rc.legacy.rc_interval; INIT_DELAYED_WORK(&d->rc_query_work, legacy_dvb_usb_read_remote_control); info("schedule remote query interval to %d msecs.", rc_interval); schedule_delayed_work(&d->rc_query_work, msecs_to_jiffies(rc_interval)); d->state |= DVB_USB_STATE_REMOTE; return err; } /* Remote-control poll function - called every dib->rc_query_interval ms to see * whether the remote control has received anything. * * TODO: Fix the repeat rate of the input device. */ static void dvb_usb_read_remote_control(struct work_struct *work) { struct dvb_usb_device *d = container_of(work, struct dvb_usb_device, rc_query_work.work); int err; /* TODO: need a lock here. We can simply skip checking for the remote control if we're busy. */ /* when the parameter has been set to 1 via sysfs while the * driver was running, or when bulk mode is enabled after IR init */ if (dvb_usb_disable_rc_polling || d->props.rc.core.bulk_mode) return; err = d->props.rc.core.rc_query(d); if (err) err("error %d while querying for an remote control event.", err); schedule_delayed_work(&d->rc_query_work, msecs_to_jiffies(d->props.rc.core.rc_interval)); } static int rc_core_dvb_usb_remote_init(struct dvb_usb_device *d) { int err, rc_interval; struct rc_dev *dev; dev = rc_allocate_device(d->props.rc.core.driver_type); if (!dev) return -ENOMEM; dev->driver_name = d->props.rc.core.module_name; dev->map_name = d->props.rc.core.rc_codes; dev->change_protocol = d->props.rc.core.change_protocol; dev->allowed_protocols = d->props.rc.core.allowed_protos; usb_to_input_id(d->udev, &dev->input_id); dev->device_name = d->desc->name; dev->input_phys = d->rc_phys; dev->dev.parent = &d->udev->dev; dev->priv = d; dev->scancode_mask = d->props.rc.core.scancode_mask; err = rc_register_device(dev); if (err < 0) { rc_free_device(dev); return err; } d->input_dev = NULL; d->rc_dev = dev; if (!d->props.rc.core.rc_query || d->props.rc.core.bulk_mode) return 0; /* Polling mode - initialize a work queue for handling it */ INIT_DELAYED_WORK(&d->rc_query_work, dvb_usb_read_remote_control); rc_interval = d->props.rc.core.rc_interval; info("schedule remote query interval to %d msecs.", rc_interval); schedule_delayed_work(&d->rc_query_work, msecs_to_jiffies(rc_interval)); return 0; } int dvb_usb_remote_init(struct dvb_usb_device *d) { int err; if (dvb_usb_disable_rc_polling) return 0; if (d->props.rc.legacy.rc_map_table && d->props.rc.legacy.rc_query) d->props.rc.mode = DVB_RC_LEGACY; else if (d->props.rc.core.rc_codes) d->props.rc.mode = DVB_RC_CORE; else return 0; usb_make_path(d->udev, d->rc_phys, sizeof(d->rc_phys)); strlcat(d->rc_phys, "/ir0", sizeof(d->rc_phys)); /* Start the remote-control polling. */ if (d->props.rc.legacy.rc_interval < 40) d->props.rc.legacy.rc_interval = 100; /* default */ if (d->props.rc.mode == DVB_RC_LEGACY) err = legacy_dvb_usb_remote_init(d); else err = rc_core_dvb_usb_remote_init(d); if (err) return err; d->state |= DVB_USB_STATE_REMOTE; return 0; } int dvb_usb_remote_exit(struct dvb_usb_device *d) { if (d->state & DVB_USB_STATE_REMOTE) { cancel_delayed_work_sync(&d->rc_query_work); if (d->props.rc.mode == DVB_RC_LEGACY) input_unregister_device(d->input_dev); else rc_unregister_device(d->rc_dev); } d->state &= ~DVB_USB_STATE_REMOTE; return 0; } #define DVB_USB_RC_NEC_EMPTY 0x00 #define DVB_USB_RC_NEC_KEY_PRESSED 0x01 #define DVB_USB_RC_NEC_KEY_REPEATED 0x02 int dvb_usb_nec_rc_key_to_event(struct dvb_usb_device *d, u8 keybuf[5], u32 *event, int *state) { int i; struct rc_map_table *keymap = d->props.rc.legacy.rc_map_table; *event = 0; *state = REMOTE_NO_KEY_PRESSED; switch (keybuf[0]) { case DVB_USB_RC_NEC_EMPTY: break; case DVB_USB_RC_NEC_KEY_PRESSED: if ((u8) ~keybuf[1] != keybuf[2] || (u8) ~keybuf[3] != keybuf[4]) { deb_err("remote control checksum failed.\n"); break; } /* See if we can match the raw key code. */ for (i = 0; i < d->props.rc.legacy.rc_map_size; i++) if (rc5_custom(&keymap[i]) == keybuf[1] && rc5_data(&keymap[i]) == keybuf[3]) { *event = keymap[i].keycode; *state = REMOTE_KEY_PRESSED; return 0; } deb_err("key mapping failed - no appropriate key found in keymapping\n"); break; case DVB_USB_RC_NEC_KEY_REPEATED: *state = REMOTE_KEY_REPEAT; break; default: deb_err("unknown type of remote status: %d\n",keybuf[0]); break; } return 0; } EXPORT_SYMBOL(dvb_usb_nec_rc_key_to_event);
63 50 1770 534 535 40 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _linux_POSIX_TIMERS_H #define _linux_POSIX_TIMERS_H #include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/pid.h> #include <linux/posix-timers_types.h> #include <linux/rcuref.h> #include <linux/spinlock.h> #include <linux/timerqueue.h> struct kernel_siginfo; struct task_struct; struct sigqueue; struct k_itimer; static inline clockid_t make_process_cpuclock(const unsigned int pid, const clockid_t clock) { return ((~pid) << 3) | clock; } static inline clockid_t make_thread_cpuclock(const unsigned int tid, const clockid_t clock) { return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK); } static inline clockid_t fd_to_clockid(const int fd) { return make_process_cpuclock((unsigned int) fd, CLOCKFD); } static inline int clockid_to_fd(const clockid_t clk) { return ~(clk >> 3); } #ifdef CONFIG_POSIX_TIMERS #include <linux/signal_types.h> /** * cpu_timer - Posix CPU timer representation for k_itimer * @node: timerqueue node to queue in the task/sig * @head: timerqueue head on which this timer is queued * @pid: Pointer to target task PID * @elist: List head for the expiry list * @firing: Timer is currently firing * @nanosleep: Timer is used for nanosleep and is not a regular posix-timer * @handling: Pointer to the task which handles expiry */ struct cpu_timer { struct timerqueue_node node; struct timerqueue_head *head; struct pid *pid; struct list_head elist; bool firing; bool nanosleep; struct task_struct __rcu *handling; }; static inline bool cpu_timer_enqueue(struct timerqueue_head *head, struct cpu_timer *ctmr) { ctmr->head = head; return timerqueue_add(head, &ctmr->node); } static inline bool cpu_timer_queued(struct cpu_timer *ctmr) { return !!ctmr->head; } static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr) { if (cpu_timer_queued(ctmr)) { timerqueue_del(ctmr->head, &ctmr->node); ctmr->head = NULL; return true; } return false; } static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr) { return ctmr->node.expires; } static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp) { ctmr->node.expires = exp; } static inline void posix_cputimers_init(struct posix_cputimers *pct) { memset(pct, 0, sizeof(*pct)); pct->bases[0].nextevt = U64_MAX; pct->bases[1].nextevt = U64_MAX; pct->bases[2].nextevt = U64_MAX; } void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit); static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, u64 runtime) { pct->bases[CPUCLOCK_SCHED].nextevt = runtime; } void posixtimer_rearm_itimer(struct task_struct *p); bool posixtimer_init_sigqueue(struct sigqueue *q); void posixtimer_send_sigqueue(struct k_itimer *tmr); bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); void posixtimer_free_timer(struct k_itimer *timer); long posixtimer_create_prctl(unsigned long ctrl); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ .nextevt = U64_MAX, \ } #define INIT_CPU_TIMERBASES(b) { \ INIT_CPU_TIMERBASE(b[0]), \ INIT_CPU_TIMERBASE(b[1]), \ INIT_CPU_TIMERBASE(b[2]), \ } #define INIT_CPU_TIMERS(s) \ .posix_cputimers = { \ .bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases), \ }, #else struct cpu_timer { }; #define INIT_CPU_TIMERS(s) static inline void posix_cputimers_init(struct posix_cputimers *pct) { } static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } static inline void posixtimer_rearm_itimer(struct task_struct *p) { } static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { return false; } static inline void posixtimer_free_timer(struct k_itimer *timer) { } static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK void clear_posix_cputimers_work(struct task_struct *p); void posix_cputimers_init_work(void); #else static inline void clear_posix_cputimers_work(struct task_struct *p) { } static inline void posix_cputimers_init_work(void) { } #endif /** * struct k_itimer - POSIX.1b interval timer structure. * @list: List node for binding the timer to tsk::signal::posix_timers * @ignored_list: List node for tracking ignored timers in tsk::signal::ignored_posix_timers * @t_hash: Entry in the posix timer hash table * @it_lock: Lock protecting the timer * @kclock: Pointer to the k_clock struct handling this timer * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer * @it_status: The status of the timer * @it_sig_periodic: The periodic status at signal delivery * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_signal_seq: Sequence count to control signal delivery * @it_sigqueue_seq: The sequence count at the point where the signal was queued * @it_sigev_notify: The notify word of sigevent struct for signal delivery * @it_interval: The interval for periodic timers * @it_signal: Pointer to the creators signal struct * @it_pid: The pid of the process/task targeted by the signal * @it_process: The task to wakeup on clock_nanosleep (CPU timers) * @rcuref: Reference count for life time management * @sigq: Embedded sigqueue * @it: Union representing the various posix timer type * internals. * @rcu: RCU head for freeing the timer. */ struct k_itimer { /* 1st cacheline contains read-mostly fields */ struct hlist_node t_hash; struct hlist_node list; timer_t it_id; clockid_t it_clock; int it_sigev_notify; enum pid_type it_pid_type; struct signal_struct *it_signal; const struct k_clock *kclock; /* 2nd cacheline and above contain fields which are modified regularly */ spinlock_t it_lock; int it_status; bool it_sig_periodic; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; unsigned int it_sigqueue_seq; ktime_t it_interval; struct hlist_node ignored_list; union { struct pid *it_pid; struct task_struct *it_process; }; struct sigqueue sigq; rcuref_t rcuref; union { struct { struct hrtimer timer; } real; struct cpu_timer cpu; struct { struct alarm alarmtimer; } alarm; } it; struct rcu_head rcu; } ____cacheline_aligned_in_smp; void run_posix_cpu_timers(void); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, u64 *newval, u64 *oldval); int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); #ifdef CONFIG_POSIX_TIMERS static inline void posixtimer_putref(struct k_itimer *tmr) { if (rcuref_put(&tmr->rcuref)) posixtimer_free_timer(tmr); } static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); WARN_ON_ONCE(!rcuref_get(&tmr->rcuref)); } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); posixtimer_putref(tmr); } static inline bool posixtimer_valid(const struct k_itimer *timer) { unsigned long val = (unsigned long)timer->it_signal; return !(val & 0x1UL); } #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { } #endif /* !CONFIG_POSIX_TIMERS */ #endif
2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC packet reception * * Copyright (C) 2007, 2016, 2022 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "ar-internal.h" static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb); /* * handle data received on the local endpoint * - may be called in interrupt context * * [!] Note that as this is called from the encap_rcv hook, the socket is not * held locked by the caller and nothing prevents sk_user_data on the UDP from * being cleared in the middle of processing this function. * * Called with the RCU read lock held from the IP layer via UDP. */ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) { struct sk_buff_head *rx_queue; struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); struct task_struct *io_thread; if (unlikely(!local)) { kfree_skb(skb); return 0; } io_thread = READ_ONCE(local->io_thread); if (!io_thread) { kfree_skb(skb); return 0; } if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); skb->mark = RXRPC_SKB_MARK_PACKET; rxrpc_new_skb(skb, rxrpc_skb_new_encap_rcv); rx_queue = &local->rx_queue; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY if (rxrpc_inject_rx_delay || !skb_queue_empty(&local->rx_delay_queue)) { skb->tstamp = ktime_add_ms(skb->tstamp, rxrpc_inject_rx_delay); rx_queue = &local->rx_delay_queue; } #endif skb_queue_tail(rx_queue, skb); wake_up_process(io_thread); return 0; } /* * Handle an error received on the local endpoint. */ void rxrpc_error_report(struct sock *sk) { struct rxrpc_local *local; struct sk_buff *skb; rcu_read_lock(); local = rcu_dereference_sk_user_data(sk); if (unlikely(!local)) { rcu_read_unlock(); return; } while ((skb = skb_dequeue(&sk->sk_error_queue))) { skb->mark = RXRPC_SKB_MARK_ERROR; rxrpc_new_skb(skb, rxrpc_skb_new_error_report); skb_queue_tail(&local->rx_queue, skb); } rxrpc_wake_up_io_thread(local); rcu_read_unlock(); } /* * Directly produce an abort from a packet. */ bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, abort_code, err); skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = abort_code; return false; } static bool rxrpc_bad_message(struct sk_buff *skb, enum rxrpc_abort_reason why) { return rxrpc_direct_abort(skb, why, RX_PROTOCOL_ERROR, -EBADMSG); } #define just_discard true /* * Process event packets targeted at a local endpoint. */ static bool rxrpc_input_version(struct rxrpc_local *local, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); char v; _enter(""); rxrpc_see_skb(skb, rxrpc_skb_see_version); if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &v, 1) >= 0) { if (v == 0) rxrpc_send_version_request(local, &sp->hdr, skb); } return true; } /* * Extract the wire header from a packet and translate the byte order. */ static bool rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) { struct rxrpc_wire_header whdr; struct rxrpc_ackpacket ack; /* dig out the RxRPC connection details */ if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) return rxrpc_bad_message(skb, rxrpc_badmsg_short_hdr); memset(sp, 0, sizeof(*sp)); sp->hdr.epoch = ntohl(whdr.epoch); sp->hdr.cid = ntohl(whdr.cid); sp->hdr.callNumber = ntohl(whdr.callNumber); sp->hdr.seq = ntohl(whdr.seq); sp->hdr.serial = ntohl(whdr.serial); sp->hdr.flags = whdr.flags; sp->hdr.type = whdr.type; sp->hdr.userStatus = whdr.userStatus; sp->hdr.securityIndex = whdr.securityIndex; sp->hdr._rsvd = ntohs(whdr._rsvd); sp->hdr.serviceId = ntohs(whdr.serviceId); if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK) { if (skb_copy_bits(skb, sizeof(whdr), &ack, sizeof(ack)) < 0) return rxrpc_bad_message(skb, rxrpc_badmsg_short_ack); sp->ack.first_ack = ntohl(ack.firstPacket); sp->ack.prev_ack = ntohl(ack.previousPacket); sp->ack.acked_serial = ntohl(ack.serial); sp->ack.reason = ack.reason; sp->ack.nr_acks = ack.nAcks; } return true; } /* * Extract the abort code from an ABORT packet and stash it in skb->priority. */ static bool rxrpc_extract_abort(struct sk_buff *skb) { __be32 wtmp; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &wtmp, sizeof(wtmp)) < 0) return false; skb->priority = ntohl(wtmp); return true; } /* * Process packets received on the local endpoint */ static bool rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) { struct rxrpc_connection *conn; struct sockaddr_rxrpc peer_srx; struct rxrpc_skb_priv *sp; struct rxrpc_peer *peer = NULL; struct sk_buff *skb = *_skb; bool ret = false; skb_pull(skb, sizeof(struct udphdr)); sp = rxrpc_skb(skb); /* dig out the RxRPC connection details */ if (!rxrpc_extract_header(sp, skb)) return just_discard; if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { trace_rxrpc_rx_lose(sp); return just_discard; } } trace_rxrpc_rx_packet(sp); switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: if (rxrpc_to_client(sp)) return just_discard; return rxrpc_input_version(local, skb); case RXRPC_PACKET_TYPE_BUSY: if (rxrpc_to_server(sp)) return just_discard; fallthrough; case RXRPC_PACKET_TYPE_ACK: case RXRPC_PACKET_TYPE_ACKALL: if (sp->hdr.callNumber == 0) return rxrpc_bad_message(skb, rxrpc_badmsg_zero_call); break; case RXRPC_PACKET_TYPE_ABORT: if (!rxrpc_extract_abort(skb)) return just_discard; /* Just discard if malformed */ break; case RXRPC_PACKET_TYPE_DATA: if (sp->hdr.callNumber == 0) return rxrpc_bad_message(skb, rxrpc_badmsg_zero_call); if (sp->hdr.seq == 0) return rxrpc_bad_message(skb, rxrpc_badmsg_zero_seq); /* Unshare the packet so that it can be modified for in-place * decryption. */ if (sp->hdr.securityIndex != 0) { skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) { rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare_nomem); *_skb = NULL; return just_discard; } if (skb != *_skb) { rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare); *_skb = skb; rxrpc_new_skb(skb, rxrpc_skb_new_unshared); sp = rxrpc_skb(skb); } } break; case RXRPC_PACKET_TYPE_CHALLENGE: if (rxrpc_to_server(sp)) return just_discard; break; case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_to_client(sp)) return just_discard; break; /* Packet types 9-11 should just be ignored. */ case RXRPC_PACKET_TYPE_PARAMS: case RXRPC_PACKET_TYPE_10: case RXRPC_PACKET_TYPE_11: return just_discard; default: return rxrpc_bad_message(skb, rxrpc_badmsg_unsupported_packet); } if (sp->hdr.serviceId == 0) return rxrpc_bad_message(skb, rxrpc_badmsg_zero_service); if (WARN_ON_ONCE(rxrpc_extract_addr_from_skb(&peer_srx, skb) < 0)) return just_discard; /* Unsupported address type. */ if (peer_srx.transport.family != local->srx.transport.family && (peer_srx.transport.family == AF_INET && local->srx.transport.family != AF_INET6)) { pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", peer_srx.transport.family, local->srx.transport.family); return just_discard; /* Wrong address type. */ } if (rxrpc_to_client(sp)) { rcu_read_lock(); conn = rxrpc_find_client_connection_rcu(local, &peer_srx, skb); conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input); rcu_read_unlock(); if (!conn) return rxrpc_protocol_error(skb, rxrpc_eproto_no_client_conn); ret = rxrpc_input_packet_on_conn(conn, &peer_srx, skb); rxrpc_put_connection(conn, rxrpc_conn_put_call_input); return ret; } /* We need to look up service connections by the full protocol * parameter set. We look up the peer first as an intermediate step * and then the connection from the peer's tree. */ rcu_read_lock(); peer = rxrpc_lookup_peer_rcu(local, &peer_srx); if (!peer) { rcu_read_unlock(); return rxrpc_new_incoming_call(local, NULL, NULL, &peer_srx, skb); } conn = rxrpc_find_service_conn_rcu(peer, skb); conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input); if (conn) { rcu_read_unlock(); ret = rxrpc_input_packet_on_conn(conn, &peer_srx, skb); rxrpc_put_connection(conn, rxrpc_conn_put_call_input); return ret; } peer = rxrpc_get_peer_maybe(peer, rxrpc_peer_get_input); rcu_read_unlock(); ret = rxrpc_new_incoming_call(local, peer, NULL, &peer_srx, skb); rxrpc_put_peer(peer, rxrpc_peer_put_input); return ret; } /* * Deal with a packet that's associated with an extant connection. */ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; unsigned int channel; if (sp->hdr.securityIndex != conn->security_ix) return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security, RXKADINCONSISTENCY, -EBADMSG); if (sp->hdr.serviceId != conn->service_id) { int old_id; if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) return rxrpc_protocol_error(skb, rxrpc_eproto_reupgrade); old_id = cmpxchg(&conn->service_id, conn->orig_service_id, sp->hdr.serviceId); if (old_id != conn->orig_service_id && old_id != sp->hdr.serviceId) return rxrpc_protocol_error(skb, rxrpc_eproto_bad_upgrade); } if (after(sp->hdr.serial, conn->hi_serial)) conn->hi_serial = sp->hdr.serial; /* It's a connection-level packet if the call number is 0. */ if (sp->hdr.callNumber == 0) return rxrpc_input_conn_packet(conn, skb); /* Deal with path MTU discovery probing. */ if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK && conn->pmtud_probe && after_eq(sp->ack.acked_serial, conn->pmtud_probe)) rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false); /* Call-bound packets are routed by connection channel. */ channel = sp->hdr.cid & RXRPC_CHANNELMASK; chan = &conn->channels[channel]; /* Ignore really old calls */ if (sp->hdr.callNumber < chan->last_call) return just_discard; if (sp->hdr.callNumber == chan->last_call) { if (chan->call || sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) return just_discard; /* For the previous service call, if completed successfully, we * discard all further packets. */ if (rxrpc_conn_is_service(conn) && chan->last_type == RXRPC_PACKET_TYPE_ACK) return just_discard; /* But otherwise we need to retransmit the final packet from * data cached in the connection record. */ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) trace_rxrpc_rx_data(chan->call_debug_id, sp->hdr.seq, sp->hdr.serial, sp->hdr.flags); rxrpc_conn_retransmit_call(conn, skb, channel); return just_discard; } call = rxrpc_try_get_call(chan->call, rxrpc_call_get_input); if (sp->hdr.callNumber > chan->call_id) { if (rxrpc_to_client(sp)) { rxrpc_put_call(call, rxrpc_call_put_input); return rxrpc_protocol_error(skb, rxrpc_eproto_unexpected_implicit_end); } if (call) { rxrpc_implicit_end_call(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); call = NULL; } } if (!call) { if (rxrpc_to_client(sp)) return rxrpc_protocol_error(skb, rxrpc_eproto_no_client_call); return rxrpc_new_incoming_call(conn->local, conn->peer, conn, peer_srx, skb); } rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); return true; } /* * I/O and event handling thread. */ int rxrpc_io_thread(void *data) { struct rxrpc_connection *conn; struct sk_buff_head rx_queue; struct rxrpc_local *local = data; struct rxrpc_call *call; struct sk_buff *skb; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY ktime_t now; #endif bool should_stop; LIST_HEAD(conn_attend_q); LIST_HEAD(call_attend_q); complete(&local->io_thread_ready); skb_queue_head_init(&rx_queue); set_user_nice(current, MIN_NICE); for (;;) { rxrpc_inc_stat(local->rxnet, stat_io_loop); /* Inject a delay into packets if requested. */ #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY now = ktime_get_real(); while ((skb = skb_peek(&local->rx_delay_queue))) { if (ktime_before(now, skb->tstamp)) break; skb = skb_dequeue(&local->rx_delay_queue); skb_queue_tail(&local->rx_queue, skb); } #endif if (!skb_queue_empty(&local->rx_queue)) { spin_lock_irq(&local->rx_queue.lock); skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); spin_unlock_irq(&local->rx_queue.lock); trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue)); } /* Distribute packets and errors. */ while ((skb = __skb_dequeue(&rx_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: skb->priority = 0; if (!rxrpc_input_packet(local, &skb)) rxrpc_reject_packet(local, skb); trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_free_skb(skb, rxrpc_skb_put_input); break; case RXRPC_SKB_MARK_ERROR: rxrpc_input_error(local, skb); rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: rxrpc_input_conn_event(sp->poke_conn, skb); rxrpc_put_connection(sp->poke_conn, rxrpc_conn_put_poke); rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured); break; default: WARN_ON_ONCE(1); rxrpc_free_skb(skb, rxrpc_skb_put_unknown); break; } } /* Deal with connections that want immediate attention. */ if (!list_empty_careful(&local->conn_attend_q)) { spin_lock_irq(&local->lock); list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); spin_unlock_irq(&local->lock); } while ((conn = list_first_entry_or_null(&conn_attend_q, struct rxrpc_connection, attend_link))) { spin_lock_irq(&local->lock); list_del_init(&conn->attend_link); spin_unlock_irq(&local->lock); rxrpc_input_conn_event(conn, NULL); rxrpc_put_connection(conn, rxrpc_conn_put_poke); } if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags)) rxrpc_discard_expired_client_conns(local); /* Deal with calls that want immediate attention. */ spin_lock_irq(&local->lock); list_splice_tail_init(&local->call_attend_q, &call_attend_q); spin_unlock_irq(&local->lock); while ((call = list_first_entry_or_null(&call_attend_q, struct rxrpc_call, attend_link))) { spin_lock_irq(&local->lock); list_del_init(&call->attend_link); spin_unlock_irq(&local->lock); trace_rxrpc_call_poked(call); rxrpc_input_call_event(call); rxrpc_put_call(call, rxrpc_call_put_poke); } if (!list_empty(&local->new_client_calls)) rxrpc_connect_client_calls(local); set_current_state(TASK_INTERRUPTIBLE); should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || !list_empty(&local->call_attend_q) || !list_empty(&local->conn_attend_q) || !list_empty(&local->new_client_calls) || test_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags)) { __set_current_state(TASK_RUNNING); continue; } if (should_stop) break; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY skb = skb_peek(&local->rx_delay_queue); if (skb) { unsigned long timeout; ktime_t tstamp = skb->tstamp; ktime_t now = ktime_get_real(); s64 delay_ns = ktime_to_ns(ktime_sub(tstamp, now)); if (delay_ns <= 0) { __set_current_state(TASK_RUNNING); continue; } timeout = nsecs_to_jiffies(delay_ns); timeout = umax(timeout, 1); schedule_timeout(timeout); __set_current_state(TASK_RUNNING); continue; } #endif schedule(); } __set_current_state(TASK_RUNNING); rxrpc_see_local(local, rxrpc_local_stop); rxrpc_destroy_local(local); WRITE_ONCE(local->io_thread, NULL); rxrpc_see_local(local, rxrpc_local_stopped); return 0; }
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 /* SPDX-License-Identifier: GPL-2.0-or-later * * Copyright (C) 2005 David Brownell */ #ifndef __LINUX_SPI_H #define __LINUX_SPI_H #include <linux/acpi.h> #include <linux/bits.h> #include <linux/completion.h> #include <linux/device.h> #include <linux/gpio/consumer.h> #include <linux/kthread.h> #include <linux/mod_devicetable.h> #include <linux/overflow.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/u64_stats_sync.h> #include <uapi/linux/spi/spi.h> /* Max no. of CS supported per spi device */ #define SPI_CS_CNT_MAX 16 struct dma_chan; struct software_node; struct ptp_system_timestamp; struct spi_controller; struct spi_transfer; struct spi_controller_mem_ops; struct spi_controller_mem_caps; struct spi_message; struct spi_offload; struct spi_offload_config; /* * INTERFACES between SPI controller-side drivers and SPI target protocol handlers, * and SPI infrastructure. */ extern const struct bus_type spi_bus_type; /** * struct spi_statistics - statistics for spi transfers * @syncp: seqcount to protect members in this struct for per-cpu update * on 32-bit systems * * @messages: number of spi-messages handled * @transfers: number of spi_transfers handled * @errors: number of errors during spi_transfer * @timedout: number of timeouts during spi_transfer * * @spi_sync: number of times spi_sync is used * @spi_sync_immediate: * number of times spi_sync is executed immediately * in calling context without queuing and scheduling * @spi_async: number of times spi_async is used * * @bytes: number of bytes transferred to/from device * @bytes_tx: number of bytes sent to device * @bytes_rx: number of bytes received from device * * @transfer_bytes_histo: * transfer bytes histogram * * @transfers_split_maxsize: * number of transfers that have been split because of * maxsize limit */ struct spi_statistics { struct u64_stats_sync syncp; u64_stats_t messages; u64_stats_t transfers; u64_stats_t errors; u64_stats_t timedout; u64_stats_t spi_sync; u64_stats_t spi_sync_immediate; u64_stats_t spi_async; u64_stats_t bytes; u64_stats_t bytes_rx; u64_stats_t bytes_tx; #define SPI_STATISTICS_HISTO_SIZE 17 u64_stats_t transfer_bytes_histo[SPI_STATISTICS_HISTO_SIZE]; u64_stats_t transfers_split_maxsize; }; #define SPI_STATISTICS_ADD_TO_FIELD(pcpu_stats, field, count) \ do { \ struct spi_statistics *__lstats; \ get_cpu(); \ __lstats = this_cpu_ptr(pcpu_stats); \ u64_stats_update_begin(&__lstats->syncp); \ u64_stats_add(&__lstats->field, count); \ u64_stats_update_end(&__lstats->syncp); \ put_cpu(); \ } while (0) #define SPI_STATISTICS_INCREMENT_FIELD(pcpu_stats, field) \ do { \ struct spi_statistics *__lstats; \ get_cpu(); \ __lstats = this_cpu_ptr(pcpu_stats); \ u64_stats_update_begin(&__lstats->syncp); \ u64_stats_inc(&__lstats->field); \ u64_stats_update_end(&__lstats->syncp); \ put_cpu(); \ } while (0) /** * struct spi_delay - SPI delay information * @value: Value for the delay * @unit: Unit for the delay */ struct spi_delay { #define SPI_DELAY_UNIT_USECS 0 #define SPI_DELAY_UNIT_NSECS 1 #define SPI_DELAY_UNIT_SCK 2 u16 value; u8 unit; }; extern int spi_delay_to_ns(struct spi_delay *_delay, struct spi_transfer *xfer); extern int spi_delay_exec(struct spi_delay *_delay, struct spi_transfer *xfer); extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, struct spi_transfer *xfer); /** * struct spi_device - Controller side proxy for an SPI target device * @dev: Driver model representation of the device. * @controller: SPI controller used with the device. * @max_speed_hz: Maximum clock rate to be used with this chip * (on this board); may be changed by the device's driver. * The spi_transfer.speed_hz can override this for each transfer. * @bits_per_word: Data transfers involve one or more words; word sizes * like eight or 12 bits are common. In-memory wordsizes are * powers of two bytes (e.g. 20 bit samples use 32 bits). * This may be changed by the device's driver, or left at the * default (0) indicating protocol words are eight bit bytes. * The spi_transfer.bits_per_word can override this for each transfer. * @rt: Make the pump thread real time priority. * @mode: The spi mode defines how data is clocked out and in. * This may be changed by the device's driver. * The "active low" default for chipselect mode can be overridden * (by specifying SPI_CS_HIGH) as can the "MSB first" default for * each word in a transfer (by specifying SPI_LSB_FIRST). * @irq: Negative, or the number passed to request_irq() to receive * interrupts from this device. * @controller_state: Controller's runtime state * @controller_data: Board-specific definitions for controller, such as * FIFO initialization parameters; from board_info.controller_data * @modalias: Name of the driver to use with this device, or an alias * for that name. This appears in the sysfs "modalias" attribute * for driver coldplugging, and in uevents used for hotplugging * @driver_override: If the name of a driver is written to this attribute, then * the device will bind to the named driver and only the named driver. * Do not set directly, because core frees it; use driver_set_override() to * set or clear it. * @pcpu_statistics: statistics for the spi_device * @word_delay: delay to be inserted between consecutive * words of a transfer * @cs_setup: delay to be introduced by the controller after CS is asserted * @cs_hold: delay to be introduced by the controller before CS is deasserted * @cs_inactive: delay to be introduced by the controller after CS is * deasserted. If @cs_change_delay is used from @spi_transfer, then the * two delays will be added up. * @chip_select: Array of physical chipselect, spi->chipselect[i] gives * the corresponding physical CS for logical CS i. * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines * (optional, NULL when not using a GPIO line) * * A @spi_device is used to interchange data between an SPI target device * (usually a discrete chip) and CPU memory. * * In @dev, the platform_data is used to hold information about this * device that's meaningful to the device's protocol driver, but not * to its controller. One example might be an identifier for a chip * variant with slightly different functionality; another might be * information about how this particular board wires the chip's pins. */ struct spi_device { struct device dev; struct spi_controller *controller; u32 max_speed_hz; u8 bits_per_word; bool rt; #define SPI_NO_TX BIT(31) /* No transmit wire */ #define SPI_NO_RX BIT(30) /* No receive wire */ /* * TPM specification defines flow control over SPI. Client device * can insert a wait state on MISO when address is transmitted by * controller on MOSI. Detecting the wait state in software is only * possible for full duplex controllers. For controllers that support * only half-duplex, the wait state detection needs to be implemented * in hardware. TPM devices would set this flag when hardware flow * control is expected from SPI controller. */ #define SPI_TPM_HW_FLOW BIT(29) /* TPM HW flow control */ /* * All bits defined above should be covered by SPI_MODE_KERNEL_MASK. * The SPI_MODE_KERNEL_MASK has the SPI_MODE_USER_MASK counterpart, * which is defined in 'include/uapi/linux/spi/spi.h'. * The bits defined here are from bit 31 downwards, while in * SPI_MODE_USER_MASK are from 0 upwards. * These bits must not overlap. A static assert check should make sure of that. * If adding extra bits, make sure to decrease the bit index below as well. */ #define SPI_MODE_KERNEL_MASK (~(BIT(29) - 1)) u32 mode; int irq; void *controller_state; void *controller_data; char modalias[SPI_NAME_SIZE]; const char *driver_override; /* The statistics */ struct spi_statistics __percpu *pcpu_statistics; struct spi_delay word_delay; /* Inter-word delay */ /* CS delays */ struct spi_delay cs_setup; struct spi_delay cs_hold; struct spi_delay cs_inactive; u8 chip_select[SPI_CS_CNT_MAX]; /* * Bit mask of the chipselect(s) that the driver need to use from * the chipselect array. When the controller is capable to handle * multiple chip selects & memories are connected in parallel * then more than one bit need to be set in cs_index_mask. */ u32 cs_index_mask : SPI_CS_CNT_MAX; struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ /* * Likely need more hooks for more protocol options affecting how * the controller talks to each chip, like: * - memory packing (12 bit samples into low bits, others zeroed) * - priority * - chipselect delays * - ... */ }; /* Make sure that SPI_MODE_KERNEL_MASK & SPI_MODE_USER_MASK don't overlap */ static_assert((SPI_MODE_KERNEL_MASK & SPI_MODE_USER_MASK) == 0, "SPI_MODE_USER_MASK & SPI_MODE_KERNEL_MASK must not overlap"); #define to_spi_device(__dev) container_of_const(__dev, struct spi_device, dev) /* Most drivers won't need to care about device refcounting */ static inline struct spi_device *spi_dev_get(struct spi_device *spi) { return (spi && get_device(&spi->dev)) ? spi : NULL; } static inline void spi_dev_put(struct spi_device *spi) { if (spi) put_device(&spi->dev); } /* ctldata is for the bus_controller driver's runtime state */ static inline void *spi_get_ctldata(const struct spi_device *spi) { return spi->controller_state; } static inline void spi_set_ctldata(struct spi_device *spi, void *state) { spi->controller_state = state; } /* Device driver data */ static inline void spi_set_drvdata(struct spi_device *spi, void *data) { dev_set_drvdata(&spi->dev, data); } static inline void *spi_get_drvdata(const struct spi_device *spi) { return dev_get_drvdata(&spi->dev); } static inline u8 spi_get_chipselect(const struct spi_device *spi, u8 idx) { return spi->chip_select[idx]; } static inline void spi_set_chipselect(struct spi_device *spi, u8 idx, u8 chipselect) { spi->chip_select[idx] = chipselect; } static inline struct gpio_desc *spi_get_csgpiod(const struct spi_device *spi, u8 idx) { return spi->cs_gpiod[idx]; } static inline void spi_set_csgpiod(struct spi_device *spi, u8 idx, struct gpio_desc *csgpiod) { spi->cs_gpiod[idx] = csgpiod; } static inline bool spi_is_csgpiod(struct spi_device *spi) { u8 idx; for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { if (spi_get_csgpiod(spi, idx)) return true; } return false; } /** * struct spi_driver - Host side "protocol" driver * @id_table: List of SPI devices supported by this driver * @probe: Binds this driver to the SPI device. Drivers can verify * that the device is actually present, and may need to configure * characteristics (such as bits_per_word) which weren't needed for * the initial configuration done during system setup. * @remove: Unbinds this driver from the SPI device * @shutdown: Standard shutdown callback used during system state * transitions such as powerdown/halt and kexec * @driver: SPI device drivers should initialize the name and owner * field of this structure. * * This represents the kind of device driver that uses SPI messages to * interact with the hardware at the other end of a SPI link. It's called * a "protocol" driver because it works through messages rather than talking * directly to SPI hardware (which is what the underlying SPI controller * driver does to pass those messages). These protocols are defined in the * specification for the device(s) supported by the driver. * * As a rule, those device protocols represent the lowest level interface * supported by a driver, and it will support upper level interfaces too. * Examples of such upper levels include frameworks like MTD, networking, * MMC, RTC, filesystem character device nodes, and hardware monitoring. */ struct spi_driver { const struct spi_device_id *id_table; int (*probe)(struct spi_device *spi); void (*remove)(struct spi_device *spi); void (*shutdown)(struct spi_device *spi); struct device_driver driver; }; #define to_spi_driver(__drv) \ ( __drv ? container_of_const(__drv, struct spi_driver, driver) : NULL ) extern int __spi_register_driver(struct module *owner, struct spi_driver *sdrv); /** * spi_unregister_driver - reverse effect of spi_register_driver * @sdrv: the driver to unregister * Context: can sleep */ static inline void spi_unregister_driver(struct spi_driver *sdrv) { if (sdrv) driver_unregister(&sdrv->driver); } extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 chip_select); /* Use a define to avoid include chaining to get THIS_MODULE */ #define spi_register_driver(driver) \ __spi_register_driver(THIS_MODULE, driver) /** * module_spi_driver() - Helper macro for registering a SPI driver * @__spi_driver: spi_driver struct * * Helper macro for SPI drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_spi_driver(__spi_driver) \ module_driver(__spi_driver, spi_register_driver, \ spi_unregister_driver) /** * struct spi_controller - interface to SPI host or target controller * @dev: device interface to this driver * @list: link with the global spi_controller list * @bus_num: board-specific (and often SOC-specific) identifier for a * given SPI controller. * @num_chipselect: chipselects are used to distinguish individual * SPI targets, and are numbered from zero to num_chipselects. * each target has a chipselect signal, but it's common that not * every chipselect is connected to a target. * @dma_alignment: SPI controller constraint on DMA buffers alignment. * @mode_bits: flags understood by this controller driver * @buswidth_override_bits: flags to override for this controller driver * @bits_per_word_mask: A mask indicating which values of bits_per_word are * supported by the driver. Bit n indicates that a bits_per_word n+1 is * supported. If set, the SPI core will reject any transfer with an * unsupported bits_per_word. If not set, this value is simply ignored, * and it's up to the individual driver to perform any validation. * @min_speed_hz: Lowest supported transfer speed * @max_speed_hz: Highest supported transfer speed * @flags: other constraints relevant to this driver * @slave: indicates that this is an SPI slave controller * @target: indicates that this is an SPI target controller * @devm_allocated: whether the allocation of this struct is devres-managed * @max_transfer_size: function that returns the max transfer size for * a &spi_device; may be %NULL, so the default %SIZE_MAX will be used. * @max_message_size: function that returns the max message size for * a &spi_device; may be %NULL, so the default %SIZE_MAX will be used. * @io_mutex: mutex for physical bus access * @add_lock: mutex to avoid adding devices to the same chipselect * @bus_lock_spinlock: spinlock for SPI bus locking * @bus_lock_mutex: mutex for exclusion of multiple callers * @bus_lock_flag: indicates that the SPI bus is locked for exclusive use * @setup: updates the device mode and clocking records used by a * device's SPI controller; protocol code may call this. This * must fail if an unrecognized or unsupported mode is requested. * It's always safe to call this unless transfers are pending on * the device whose settings are being modified. * @set_cs_timing: optional hook for SPI devices to request SPI * controller for configuring specific CS setup time, hold time and inactive * delay in terms of clock counts * @transfer: adds a message to the controller's transfer queue. * @cleanup: frees controller-specific state * @can_dma: determine whether this controller supports DMA * @dma_map_dev: device which can be used for DMA mapping * @cur_rx_dma_dev: device which is currently used for RX DMA mapping * @cur_tx_dma_dev: device which is currently used for TX DMA mapping * @queued: whether this controller is providing an internal message queue * @kworker: pointer to thread struct for message pump * @pump_messages: work struct for scheduling work to the message pump * @queue_lock: spinlock to synchronise access to message queue * @queue: message queue * @cur_msg: the currently in-flight message * @cur_msg_completion: a completion for the current in-flight message * @cur_msg_incomplete: Flag used internally to opportunistically skip * the @cur_msg_completion. This flag is used to check if the driver has * already called spi_finalize_current_message(). * @cur_msg_need_completion: Flag used internally to opportunistically skip * the @cur_msg_completion. This flag is used to signal the context that * is running spi_finalize_current_message() that it needs to complete() * @fallback: fallback to PIO if DMA transfer return failure with * SPI_TRANS_FAIL_NO_START. * @last_cs_mode_high: was (mode & SPI_CS_HIGH) true on the last call to set_cs. * @last_cs: the last chip_select that is recorded by set_cs, -1 on non chip * selected * @last_cs_index_mask: bit mask the last chip selects that were used * @xfer_completion: used by core transfer_one_message() * @busy: message pump is busy * @running: message pump is running * @rt: whether this queue is set to run as a realtime task * @auto_runtime_pm: the core should ensure a runtime PM reference is held * while the hardware is prepared, using the parent * device for the spidev * @max_dma_len: Maximum length of a DMA transfer for the device. * @prepare_transfer_hardware: a message will soon arrive from the queue * so the subsystem requests the driver to prepare the transfer hardware * by issuing this call * @transfer_one_message: the subsystem calls the driver to transfer a single * message while queuing transfers that arrive in the meantime. When the * driver is finished with this message, it must call * spi_finalize_current_message() so the subsystem can issue the next * message * @unprepare_transfer_hardware: there are currently no more messages on the * queue so the subsystem notifies the driver that it may relax the * hardware by issuing this call * * @set_cs: set the logic level of the chip select line. May be called * from interrupt context. * @optimize_message: optimize the message for reuse * @unoptimize_message: release resources allocated by optimize_message * @prepare_message: set up the controller to transfer a single message, * for example doing DMA mapping. Called from threaded * context. * @transfer_one: transfer a single spi_transfer. * * - return 0 if the transfer is finished, * - return 1 if the transfer is still in progress. When * the driver is finished with this transfer it must * call spi_finalize_current_transfer() so the subsystem * can issue the next transfer. If the transfer fails, the * driver must set the flag SPI_TRANS_FAIL_IO to * spi_transfer->error first, before calling * spi_finalize_current_transfer(). * Note: transfer_one and transfer_one_message are mutually * exclusive; when both are set, the generic subsystem does * not call your transfer_one callback. * @handle_err: the subsystem calls the driver to handle an error that occurs * in the generic implementation of transfer_one_message(). * @mem_ops: optimized/dedicated operations for interactions with SPI memory. * This field is optional and should only be implemented if the * controller has native support for memory like operations. * @get_offload: callback for controllers with offload support to get matching * offload instance. Implementations should return -ENODEV if no match is * found. * @put_offload: release the offload instance acquired by @get_offload. * @mem_caps: controller capabilities for the handling of memory operations. * @dtr_caps: true if controller has dtr(single/dual transfer rate) capability. * QSPI based controller should fill this based on controller's capability. * @unprepare_message: undo any work done by prepare_message(). * @target_abort: abort the ongoing transfer request on an SPI target controller * @cs_gpiods: Array of GPIO descriptors to use as chip select lines; one per CS * number. Any individual value may be NULL for CS lines that * are not GPIOs (driven by the SPI controller itself). * @use_gpio_descriptors: Turns on the code in the SPI core to parse and grab * GPIO descriptors. This will fill in @cs_gpiods and SPI devices will have * the cs_gpiod assigned if a GPIO line is found for the chipselect. * @unused_native_cs: When cs_gpiods is used, spi_register_controller() will * fill in this field with the first unused native CS, to be used by SPI * controller drivers that need to drive a native CS when using GPIO CS. * @max_native_cs: When cs_gpiods is used, and this field is filled in, * spi_register_controller() will validate all native CS (including the * unused native CS) against this value. * @pcpu_statistics: statistics for the spi_controller * @dma_tx: DMA transmit channel * @dma_rx: DMA receive channel * @dummy_rx: dummy receive buffer for full-duplex devices * @dummy_tx: dummy transmit buffer for full-duplex devices * @fw_translate_cs: If the boot firmware uses different numbering scheme * what Linux expects, this optional hook can be used to translate * between the two. * @ptp_sts_supported: If the driver sets this to true, it must provide a * time snapshot in @spi_transfer->ptp_sts as close as possible to the * moment in time when @spi_transfer->ptp_sts_word_pre and * @spi_transfer->ptp_sts_word_post were transmitted. * If the driver does not set this, the SPI core takes the snapshot as * close to the driver hand-over as possible. * @irq_flags: Interrupt enable state during PTP system timestamping * @queue_empty: signal green light for opportunistically skipping the queue * for spi_sync transfers. * @must_async: disable all fast paths in the core * @defer_optimize_message: set to true if controller cannot pre-optimize messages * and needs to defer the optimization step until the message is actually * being transferred * * Each SPI controller can communicate with one or more @spi_device * children. These make a small bus, sharing MOSI, MISO and SCK signals * but not chip select signals. Each device may be configured to use a * different clock rate, since those shared signals are ignored unless * the chip is selected. * * The driver for an SPI controller manages access to those devices through * a queue of spi_message transactions, copying data between CPU memory and * an SPI target device. For each such message it queues, it calls the * message's completion function when the transaction completes. */ struct spi_controller { struct device dev; struct list_head list; /* * Other than negative (== assign one dynamically), bus_num is fully * board-specific. Usually that simplifies to being SoC-specific. * example: one SoC has three SPI controllers, numbered 0..2, * and one board's schematics might show it using SPI-2. Software * would normally use bus_num=2 for that controller. */ s16 bus_num; /* * Chipselects will be integral to many controllers; some others * might use board-specific GPIOs. */ u16 num_chipselect; /* Some SPI controllers pose alignment requirements on DMAable * buffers; let protocol drivers know about these requirements. */ u16 dma_alignment; /* spi_device.mode flags understood by this controller driver */ u32 mode_bits; /* spi_device.mode flags override flags for this controller */ u32 buswidth_override_bits; /* Bitmask of supported bits_per_word for transfers */ u32 bits_per_word_mask; #define SPI_BPW_MASK(bits) BIT((bits) - 1) #define SPI_BPW_RANGE_MASK(min, max) GENMASK((max) - 1, (min) - 1) /* Limits on transfer speed */ u32 min_speed_hz; u32 max_speed_hz; /* Other constraints relevant to this driver */ u16 flags; #define SPI_CONTROLLER_HALF_DUPLEX BIT(0) /* Can't do full duplex */ #define SPI_CONTROLLER_NO_RX BIT(1) /* Can't do buffer read */ #define SPI_CONTROLLER_NO_TX BIT(2) /* Can't do buffer write */ #define SPI_CONTROLLER_MUST_RX BIT(3) /* Requires rx */ #define SPI_CONTROLLER_MUST_TX BIT(4) /* Requires tx */ #define SPI_CONTROLLER_GPIO_SS BIT(5) /* GPIO CS must select target device */ #define SPI_CONTROLLER_SUSPENDED BIT(6) /* Currently suspended */ /* * The spi-controller has multi chip select capability and can * assert/de-assert more than one chip select at once. */ #define SPI_CONTROLLER_MULTI_CS BIT(7) /* Flag indicating if the allocation of this struct is devres-managed */ bool devm_allocated; union { /* Flag indicating this is an SPI slave controller */ bool slave; /* Flag indicating this is an SPI target controller */ bool target; }; /* * On some hardware transfer / message size may be constrained * the limit may depend on device transfer settings. */ size_t (*max_transfer_size)(struct spi_device *spi); size_t (*max_message_size)(struct spi_device *spi); /* I/O mutex */ struct mutex io_mutex; /* Used to avoid adding the same CS twice */ struct mutex add_lock; /* Lock and mutex for SPI bus locking */ spinlock_t bus_lock_spinlock; struct mutex bus_lock_mutex; /* Flag indicating that the SPI bus is locked for exclusive use */ bool bus_lock_flag; /* * Setup mode and clock, etc (SPI driver may call many times). * * IMPORTANT: this may be called when transfers to another * device are active. DO NOT UPDATE SHARED REGISTERS in ways * which could break those transfers. */ int (*setup)(struct spi_device *spi); /* * set_cs_timing() method is for SPI controllers that supports * configuring CS timing. * * This hook allows SPI client drivers to request SPI controllers * to configure specific CS timing through spi_set_cs_timing() after * spi_setup(). */ int (*set_cs_timing)(struct spi_device *spi); /* * Bidirectional bulk transfers * * + The transfer() method may not sleep; its main role is * just to add the message to the queue. * + For now there's no remove-from-queue operation, or * any other request management * + To a given spi_device, message queueing is pure FIFO * * + The controller's main job is to process its message queue, * selecting a chip (for controllers), then transferring data * + If there are multiple spi_device children, the i/o queue * arbitration algorithm is unspecified (round robin, FIFO, * priority, reservations, preemption, etc) * * + Chipselect stays active during the entire message * (unless modified by spi_transfer.cs_change != 0). * + The message transfers use clock and SPI mode parameters * previously established by setup() for this device */ int (*transfer)(struct spi_device *spi, struct spi_message *mesg); /* Called on release() to free memory provided by spi_controller */ void (*cleanup)(struct spi_device *spi); /* * Used to enable core support for DMA handling, if can_dma() * exists and returns true then the transfer will be mapped * prior to transfer_one() being called. The driver should * not modify or store xfer and dma_tx and dma_rx must be set * while the device is prepared. */ bool (*can_dma)(struct spi_controller *ctlr, struct spi_device *spi, struct spi_transfer *xfer); struct device *dma_map_dev; struct device *cur_rx_dma_dev; struct device *cur_tx_dma_dev; /* * These hooks are for drivers that want to use the generic * controller transfer queueing mechanism. If these are used, the * transfer() function above must NOT be specified by the driver. * Over time we expect SPI drivers to be phased over to this API. */ bool queued; struct kthread_worker *kworker; struct kthread_work pump_messages; spinlock_t queue_lock; struct list_head queue; struct spi_message *cur_msg; struct completion cur_msg_completion; bool cur_msg_incomplete; bool cur_msg_need_completion; bool busy; bool running; bool rt; bool auto_runtime_pm; bool fallback; bool last_cs_mode_high; s8 last_cs[SPI_CS_CNT_MAX]; u32 last_cs_index_mask : SPI_CS_CNT_MAX; struct completion xfer_completion; size_t max_dma_len; int (*optimize_message)(struct spi_message *msg); int (*unoptimize_message)(struct spi_message *msg); int (*prepare_transfer_hardware)(struct spi_controller *ctlr); int (*transfer_one_message)(struct spi_controller *ctlr, struct spi_message *mesg); int (*unprepare_transfer_hardware)(struct spi_controller *ctlr); int (*prepare_message)(struct spi_controller *ctlr, struct spi_message *message); int (*unprepare_message)(struct spi_controller *ctlr, struct spi_message *message); int (*target_abort)(struct spi_controller *ctlr); /* * These hooks are for drivers that use a generic implementation * of transfer_one_message() provided by the core. */ void (*set_cs)(struct spi_device *spi, bool enable); int (*transfer_one)(struct spi_controller *ctlr, struct spi_device *spi, struct spi_transfer *transfer); void (*handle_err)(struct spi_controller *ctlr, struct spi_message *message); /* Optimized handlers for SPI memory-like operations. */ const struct spi_controller_mem_ops *mem_ops; const struct spi_controller_mem_caps *mem_caps; /* SPI or QSPI controller can set to true if supports SDR/DDR transfer rate */ bool dtr_caps; struct spi_offload *(*get_offload)(struct spi_device *spi, const struct spi_offload_config *config); void (*put_offload)(struct spi_offload *offload); /* GPIO chip select */ struct gpio_desc **cs_gpiods; bool use_gpio_descriptors; s8 unused_native_cs; s8 max_native_cs; /* Statistics */ struct spi_statistics __percpu *pcpu_statistics; /* DMA channels for use with core dmaengine helpers */ struct dma_chan *dma_tx; struct dma_chan *dma_rx; /* Dummy data for full duplex devices */ void *dummy_rx; void *dummy_tx; int (*fw_translate_cs)(struct spi_controller *ctlr, unsigned cs); /* * Driver sets this field to indicate it is able to snapshot SPI * transfers (needed e.g. for reading the time of POSIX clocks) */ bool ptp_sts_supported; /* Interrupt enable state during PTP system timestamping */ unsigned long irq_flags; /* Flag for enabling opportunistic skipping of the queue in spi_sync */ bool queue_empty; bool must_async; bool defer_optimize_message; }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) { return dev_get_drvdata(&ctlr->dev); } static inline void spi_controller_set_devdata(struct spi_controller *ctlr, void *data) { dev_set_drvdata(&ctlr->dev, data); } static inline struct spi_controller *spi_controller_get(struct spi_controller *ctlr) { if (!ctlr || !get_device(&ctlr->dev)) return NULL; return ctlr; } static inline void spi_controller_put(struct spi_controller *ctlr) { if (ctlr) put_device(&ctlr->dev); } static inline bool spi_controller_is_target(struct spi_controller *ctlr) { return IS_ENABLED(CONFIG_SPI_SLAVE) && ctlr->target; } /* PM calls that need to be issued by the driver */ extern int spi_controller_suspend(struct spi_controller *ctlr); extern int spi_controller_resume(struct spi_controller *ctlr); /* Calls the driver make to interact with the message queue */ extern struct spi_message *spi_get_next_queued_message(struct spi_controller *ctlr); extern void spi_finalize_current_message(struct spi_controller *ctlr); extern void spi_finalize_current_transfer(struct spi_controller *ctlr); /* Helper calls for driver to timestamp transfer */ void spi_take_timestamp_pre(struct spi_controller *ctlr, struct spi_transfer *xfer, size_t progress, bool irqs_off); void spi_take_timestamp_post(struct spi_controller *ctlr, struct spi_transfer *xfer, size_t progress, bool irqs_off); /* The SPI driver core manages memory for the spi_controller classdev */ extern struct spi_controller *__spi_alloc_controller(struct device *host, unsigned int size, bool target); static inline struct spi_controller *spi_alloc_host(struct device *dev, unsigned int size) { return __spi_alloc_controller(dev, size, false); } static inline struct spi_controller *spi_alloc_target(struct device *dev, unsigned int size) { if (!IS_ENABLED(CONFIG_SPI_SLAVE)) return NULL; return __spi_alloc_controller(dev, size, true); } struct spi_controller *__devm_spi_alloc_controller(struct device *dev, unsigned int size, bool target); static inline struct spi_controller *devm_spi_alloc_host(struct device *dev, unsigned int size) { return __devm_spi_alloc_controller(dev, size, false); } static inline struct spi_controller *devm_spi_alloc_target(struct device *dev, unsigned int size) { if (!IS_ENABLED(CONFIG_SPI_SLAVE)) return NULL; return __devm_spi_alloc_controller(dev, size, true); } extern int spi_register_controller(struct spi_controller *ctlr); extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); extern void spi_unregister_controller(struct spi_controller *ctlr); #if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SPI_MASTER) extern struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev); extern struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, struct acpi_device *adev, int index); int acpi_spi_count_resources(struct acpi_device *adev); #else static inline struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev) { return NULL; } static inline struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, struct acpi_device *adev, int index) { return ERR_PTR(-ENODEV); } static inline int acpi_spi_count_resources(struct acpi_device *adev) { return 0; } #endif /* * SPI resource management while processing a SPI message */ typedef void (*spi_res_release_t)(struct spi_controller *ctlr, struct spi_message *msg, void *res); /** * struct spi_res - SPI resource management structure * @entry: list entry * @release: release code called prior to freeing this resource * @data: extra data allocated for the specific use-case * * This is based on ideas from devres, but focused on life-cycle * management during spi_message processing. */ struct spi_res { struct list_head entry; spi_res_release_t release; unsigned long long data[]; /* Guarantee ull alignment */ }; /*---------------------------------------------------------------------------*/ /* * I/O INTERFACE between SPI controller and protocol drivers * * Protocol drivers use a queue of spi_messages, each transferring data * between the controller and memory buffers. * * The spi_messages themselves consist of a series of read+write transfer * segments. Those segments always read the same number of bits as they * write; but one or the other is easily ignored by passing a NULL buffer * pointer. (This is unlike most types of I/O API, because SPI hardware * is full duplex.) * * NOTE: Allocation of spi_transfer and spi_message memory is entirely * up to the protocol driver, which guarantees the integrity of both (as * well as the data buffers) for as long as the message is queued. */ /** * struct spi_transfer - a read/write buffer pair * @tx_buf: data to be written (DMA-safe memory), or NULL * @rx_buf: data to be read (DMA-safe memory), or NULL * @tx_dma: DMA address of tx_buf, currently not for client use * @rx_dma: DMA address of rx_buf, currently not for client use * @tx_nbits: number of bits used for writing. If 0 the default * (SPI_NBITS_SINGLE) is used. * @rx_nbits: number of bits used for reading. If 0 the default * (SPI_NBITS_SINGLE) is used. * @len: size of rx and tx buffers (in bytes) * @speed_hz: Select a speed other than the device default for this * transfer. If 0 the default (from @spi_device) is used. * @bits_per_word: select a bits_per_word other than the device default * for this transfer. If 0 the default (from @spi_device) is used. * @dummy_data: indicates transfer is dummy bytes transfer. * @cs_off: performs the transfer with chipselect off. * @cs_change: affects chipselect after this transfer completes * @cs_change_delay: delay between cs deassert and assert when * @cs_change is set and @spi_transfer is not the last in @spi_message * @delay: delay to be introduced after this transfer before * (optionally) changing the chipselect status, then starting * the next transfer or completing this @spi_message. * @word_delay: inter word delay to be introduced after each word size * (set by bits_per_word) transmission. * @effective_speed_hz: the effective SCK-speed that was used to * transfer this transfer. Set to 0 if the SPI bus driver does * not support it. * @transfer_list: transfers are sequenced through @spi_message.transfers * @tx_sg_mapped: If true, the @tx_sg is mapped for DMA * @rx_sg_mapped: If true, the @rx_sg is mapped for DMA * @tx_sg: Scatterlist for transmit, currently not for client use * @rx_sg: Scatterlist for receive, currently not for client use * @offload_flags: Flags that are only applicable to specialized SPI offload * transfers. See %SPI_OFFLOAD_XFER_* in spi-offload.h. * @ptp_sts_word_pre: The word (subject to bits_per_word semantics) offset * within @tx_buf for which the SPI device is requesting that the time * snapshot for this transfer begins. Upon completing the SPI transfer, * this value may have changed compared to what was requested, depending * on the available snapshotting resolution (DMA transfer, * @ptp_sts_supported is false, etc). * @ptp_sts_word_post: See @ptp_sts_word_post. The two can be equal (meaning * that a single byte should be snapshotted). * If the core takes care of the timestamp (if @ptp_sts_supported is false * for this controller), it will set @ptp_sts_word_pre to 0, and * @ptp_sts_word_post to the length of the transfer. This is done * purposefully (instead of setting to spi_transfer->len - 1) to denote * that a transfer-level snapshot taken from within the driver may still * be of higher quality. * @ptp_sts: Pointer to a memory location held by the SPI target device where a * PTP system timestamp structure may lie. If drivers use PIO or their * hardware has some sort of assist for retrieving exact transfer timing, * they can (and should) assert @ptp_sts_supported and populate this * structure using the ptp_read_system_*ts helper functions. * The timestamp must represent the time at which the SPI target device has * processed the word, i.e. the "pre" timestamp should be taken before * transmitting the "pre" word, and the "post" timestamp after receiving * transmit confirmation from the controller for the "post" word. * @dtr_mode: true if supports double transfer rate. * @timestamped: true if the transfer has been timestamped * @error: Error status logged by SPI controller driver. * * SPI transfers always write the same number of bytes as they read. * Protocol drivers should always provide @rx_buf and/or @tx_buf. * In some cases, they may also want to provide DMA addresses for * the data being transferred; that may reduce overhead, when the * underlying driver uses DMA. * * If the transmit buffer is NULL, zeroes will be shifted out * while filling @rx_buf. If the receive buffer is NULL, the data * shifted in will be discarded. Only "len" bytes shift out (or in). * It's an error to try to shift out a partial word. (For example, by * shifting out three bytes with word size of sixteen or twenty bits; * the former uses two bytes per word, the latter uses four bytes.) * * In-memory data values are always in native CPU byte order, translated * from the wire byte order (big-endian except with SPI_LSB_FIRST). So * for example when bits_per_word is sixteen, buffers are 2N bytes long * (@len = 2N) and hold N sixteen bit words in CPU byte order. * * When the word size of the SPI transfer is not a power-of-two multiple * of eight bits, those in-memory words include extra bits. In-memory * words are always seen by protocol drivers as right-justified, so the * undefined (rx) or unused (tx) bits are always the most significant bits. * * All SPI transfers start with the relevant chipselect active. Normally * it stays selected until after the last transfer in a message. Drivers * can affect the chipselect signal using cs_change. * * (i) If the transfer isn't the last one in the message, this flag is * used to make the chipselect briefly go inactive in the middle of the * message. Toggling chipselect in this way may be needed to terminate * a chip command, letting a single spi_message perform all of group of * chip transactions together. * * (ii) When the transfer is the last one in the message, the chip may * stay selected until the next transfer. On multi-device SPI busses * with nothing blocking messages going to other devices, this is just * a performance hint; starting a message to another device deselects * this one. But in other cases, this can be used to ensure correctness. * Some devices need protocol transactions to be built from a series of * spi_message submissions, where the content of one message is determined * by the results of previous messages and where the whole transaction * ends when the chipselect goes inactive. * * When SPI can transfer in 1x,2x or 4x. It can get this transfer information * from device through @tx_nbits and @rx_nbits. In Bi-direction, these * two should both be set. User can set transfer mode with SPI_NBITS_SINGLE(1x) * SPI_NBITS_DUAL(2x) and SPI_NBITS_QUAD(4x) to support these three transfer. * * User may also set dtr_mode to true to use dual transfer mode if desired. if * not, default considered as single transfer mode. * * The code that submits an spi_message (and its spi_transfers) * to the lower layers is responsible for managing its memory. * Zero-initialize every field you don't set up explicitly, to * insulate against future API updates. After you submit a message * and its transfers, ignore them until its completion callback. */ struct spi_transfer { /* * It's okay if tx_buf == rx_buf (right?). * For MicroWire, one buffer must be NULL. * Buffers must work with dma_*map_single() calls. */ const void *tx_buf; void *rx_buf; unsigned len; #define SPI_TRANS_FAIL_NO_START BIT(0) #define SPI_TRANS_FAIL_IO BIT(1) u16 error; bool tx_sg_mapped; bool rx_sg_mapped; struct sg_table tx_sg; struct sg_table rx_sg; dma_addr_t tx_dma; dma_addr_t rx_dma; unsigned dummy_data:1; unsigned cs_off:1; unsigned cs_change:1; unsigned tx_nbits:4; unsigned rx_nbits:4; unsigned timestamped:1; bool dtr_mode; #define SPI_NBITS_SINGLE 0x01 /* 1-bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2-bit transfer */ #define SPI_NBITS_QUAD 0x04 /* 4-bit transfer */ #define SPI_NBITS_OCTAL 0x08 /* 8-bit transfer */ u8 bits_per_word; struct spi_delay delay; struct spi_delay cs_change_delay; struct spi_delay word_delay; u32 speed_hz; u32 effective_speed_hz; /* Use %SPI_OFFLOAD_XFER_* from spi-offload.h */ unsigned int offload_flags; unsigned int ptp_sts_word_pre; unsigned int ptp_sts_word_post; struct ptp_system_timestamp *ptp_sts; struct list_head transfer_list; }; /** * struct spi_message - one multi-segment SPI transaction * @transfers: list of transfer segments in this transaction * @spi: SPI device to which the transaction is queued * @pre_optimized: peripheral driver pre-optimized the message * @optimized: the message is in the optimized state * @prepared: spi_prepare_message was called for the this message * @status: zero for success, else negative errno * @complete: called to report transaction completions * @context: the argument to complete() when it's called * @frame_length: the total number of bytes in the message * @actual_length: the total number of bytes that were transferred in all * successful segments * @queue: for use by whichever driver currently owns the message * @state: for use by whichever driver currently owns the message * @opt_state: for use by whichever driver currently owns the message * @resources: for resource management when the SPI message is processed * @offload: (optional) offload instance used by this message * * A @spi_message is used to execute an atomic sequence of data transfers, * each represented by a struct spi_transfer. The sequence is "atomic" * in the sense that no other spi_message may use that SPI bus until that * sequence completes. On some systems, many such sequences can execute as * a single programmed DMA transfer. On all systems, these messages are * queued, and might complete after transactions to other devices. Messages * sent to a given spi_device are always executed in FIFO order. * * The code that submits an spi_message (and its spi_transfers) * to the lower layers is responsible for managing its memory. * Zero-initialize every field you don't set up explicitly, to * insulate against future API updates. After you submit a message * and its transfers, ignore them until its completion callback. */ struct spi_message { struct list_head transfers; struct spi_device *spi; /* spi_optimize_message() was called for this message */ bool pre_optimized; /* __spi_optimize_message() was called for this message */ bool optimized; /* spi_prepare_message() was called for this message */ bool prepared; /* * REVISIT: we might want a flag affecting the behavior of the * last transfer ... allowing things like "read 16 bit length L" * immediately followed by "read L bytes". Basically imposing * a specific message scheduling algorithm. * * Some controller drivers (message-at-a-time queue processing) * could provide that as their default scheduling algorithm. But * others (with multi-message pipelines) could need a flag to * tell them about such special cases. */ /* Completion is reported through a callback */ int status; void (*complete)(void *context); void *context; unsigned frame_length; unsigned actual_length; /* * For optional use by whatever driver currently owns the * spi_message ... between calls to spi_async and then later * complete(), that's the spi_controller controller driver. */ struct list_head queue; void *state; /* * Optional state for use by controller driver between calls to * __spi_optimize_message() and __spi_unoptimize_message(). */ void *opt_state; /* * Optional offload instance used by this message. This must be set * by the peripheral driver before calling spi_optimize_message(). */ struct spi_offload *offload; /* List of spi_res resources when the SPI message is processed */ struct list_head resources; }; static inline void spi_message_init_no_memset(struct spi_message *m) { INIT_LIST_HEAD(&m->transfers); INIT_LIST_HEAD(&m->resources); } static inline void spi_message_init(struct spi_message *m) { memset(m, 0, sizeof *m); spi_message_init_no_memset(m); } static inline void spi_message_add_tail(struct spi_transfer *t, struct spi_message *m) { list_add_tail(&t->transfer_list, &m->transfers); } static inline void spi_transfer_del(struct spi_transfer *t) { list_del(&t->transfer_list); } static inline int spi_transfer_delay_exec(struct spi_transfer *t) { return spi_delay_exec(&t->delay, t); } /** * spi_message_init_with_transfers - Initialize spi_message and append transfers * @m: spi_message to be initialized * @xfers: An array of SPI transfers * @num_xfers: Number of items in the xfer array * * This function initializes the given spi_message and adds each spi_transfer in * the given array to the message. */ static inline void spi_message_init_with_transfers(struct spi_message *m, struct spi_transfer *xfers, unsigned int num_xfers) { unsigned int i; spi_message_init(m); for (i = 0; i < num_xfers; ++i) spi_message_add_tail(&xfers[i], m); } /* * It's fine to embed message and transaction structures in other data * structures so long as you don't free them while they're in use. */ static inline struct spi_message *spi_message_alloc(unsigned ntrans, gfp_t flags) { struct spi_message_with_transfers { struct spi_message m; struct spi_transfer t[]; } *mwt; unsigned i; mwt = kzalloc(struct_size(mwt, t, ntrans), flags); if (!mwt) return NULL; spi_message_init_no_memset(&mwt->m); for (i = 0; i < ntrans; i++) spi_message_add_tail(&mwt->t[i], &mwt->m); return &mwt->m; } static inline void spi_message_free(struct spi_message *m) { kfree(m); } extern int spi_optimize_message(struct spi_device *spi, struct spi_message *msg); extern void spi_unoptimize_message(struct spi_message *msg); extern int devm_spi_optimize_message(struct device *dev, struct spi_device *spi, struct spi_message *msg); extern int spi_setup(struct spi_device *spi); extern int spi_async(struct spi_device *spi, struct spi_message *message); extern int spi_target_abort(struct spi_device *spi); static inline size_t spi_max_message_size(struct spi_device *spi) { struct spi_controller *ctlr = spi->controller; if (!ctlr->max_message_size) return SIZE_MAX; return ctlr->max_message_size(spi); } static inline size_t spi_max_transfer_size(struct spi_device *spi) { struct spi_controller *ctlr = spi->controller; size_t tr_max = SIZE_MAX; size_t msg_max = spi_max_message_size(spi); if (ctlr->max_transfer_size) tr_max = ctlr->max_transfer_size(spi); /* Transfer size limit must not be greater than message size limit */ return min(tr_max, msg_max); } /** * spi_is_bpw_supported - Check if bits per word is supported * @spi: SPI device * @bpw: Bits per word * * This function checks to see if the SPI controller supports @bpw. * * Returns: * True if @bpw is supported, false otherwise. */ static inline bool spi_is_bpw_supported(struct spi_device *spi, u32 bpw) { u32 bpw_mask = spi->controller->bits_per_word_mask; if (bpw == 8 || (bpw <= 32 && bpw_mask & SPI_BPW_MASK(bpw))) return true; return false; } /** * spi_bpw_to_bytes - Covert bits per word to bytes * @bpw: Bits per word * * This function converts the given @bpw to bytes. The result is always * power-of-two, e.g., * * =============== ================= * Input (in bits) Output (in bytes) * =============== ================= * 5 1 * 9 2 * 21 4 * 37 8 * =============== ================= * * It will return 0 for the 0 input. * * Returns: * Bytes for the given @bpw. */ static inline u32 spi_bpw_to_bytes(u32 bpw) { return roundup_pow_of_two(BITS_TO_BYTES(bpw)); } /** * spi_controller_xfer_timeout - Compute a suitable timeout value * @ctlr: SPI device * @xfer: Transfer descriptor * * Compute a relevant timeout value for the given transfer. We derive the time * that it would take on a single data line and take twice this amount of time * with a minimum of 500ms to avoid false positives on loaded systems. * * Returns: Transfer timeout value in milliseconds. */ static inline unsigned int spi_controller_xfer_timeout(struct spi_controller *ctlr, struct spi_transfer *xfer) { return max(xfer->len * 8 * 2 / (xfer->speed_hz / 1000), 500U); } /*---------------------------------------------------------------------------*/ /* SPI transfer replacement methods which make use of spi_res */ struct spi_replaced_transfers; typedef void (*spi_replaced_release_t)(struct spi_controller *ctlr, struct spi_message *msg, struct spi_replaced_transfers *res); /** * struct spi_replaced_transfers - structure describing the spi_transfer * replacements that have occurred * so that they can get reverted * @release: some extra release code to get executed prior to * releasing this structure * @extradata: pointer to some extra data if requested or NULL * @replaced_transfers: transfers that have been replaced and which need * to get restored * @replaced_after: the transfer after which the @replaced_transfers * are to get re-inserted * @inserted: number of transfers inserted * @inserted_transfers: array of spi_transfers of array-size @inserted, * that have been replacing replaced_transfers * * Note: that @extradata will point to @inserted_transfers[@inserted] * if some extra allocation is requested, so alignment will be the same * as for spi_transfers. */ struct spi_replaced_transfers { spi_replaced_release_t release; void *extradata; struct list_head replaced_transfers; struct list_head *replaced_after; size_t inserted; struct spi_transfer inserted_transfers[]; }; /*---------------------------------------------------------------------------*/ /* SPI transfer transformation methods */ extern int spi_split_transfers_maxsize(struct spi_controller *ctlr, struct spi_message *msg, size_t maxsize); extern int spi_split_transfers_maxwords(struct spi_controller *ctlr, struct spi_message *msg, size_t maxwords); /*---------------------------------------------------------------------------*/ /* * All these synchronous SPI transfer routines are utilities layered * over the core async transfer primitive. Here, "synchronous" means * they will sleep uninterruptibly until the async transfer completes. */ extern int spi_sync(struct spi_device *spi, struct spi_message *message); extern int spi_sync_locked(struct spi_device *spi, struct spi_message *message); extern int spi_bus_lock(struct spi_controller *ctlr); extern int spi_bus_unlock(struct spi_controller *ctlr); /** * spi_sync_transfer - synchronous SPI data transfer * @spi: device with which data will be exchanged * @xfers: An array of spi_transfers * @num_xfers: Number of items in the xfer array * Context: can sleep * * Does a synchronous SPI data transfer of the given spi_transfer array. * * For more specific semantics see spi_sync(). * * Return: zero on success, else a negative error code. */ static inline int spi_sync_transfer(struct spi_device *spi, struct spi_transfer *xfers, unsigned int num_xfers) { struct spi_message msg; spi_message_init_with_transfers(&msg, xfers, num_xfers); return spi_sync(spi, &msg); } /** * spi_write - SPI synchronous write * @spi: device to which data will be written * @buf: data buffer * @len: data buffer size * Context: can sleep * * This function writes the buffer @buf. * Callable only from contexts that can sleep. * * Return: zero on success, else a negative error code. */ static inline int spi_write(struct spi_device *spi, const void *buf, size_t len) { struct spi_transfer t = { .tx_buf = buf, .len = len, }; return spi_sync_transfer(spi, &t, 1); } /** * spi_read - SPI synchronous read * @spi: device from which data will be read * @buf: data buffer * @len: data buffer size * Context: can sleep * * This function reads the buffer @buf. * Callable only from contexts that can sleep. * * Return: zero on success, else a negative error code. */ static inline int spi_read(struct spi_device *spi, void *buf, size_t len) { struct spi_transfer t = { .rx_buf = buf, .len = len, }; return spi_sync_transfer(spi, &t, 1); } /* This copies txbuf and rxbuf data; for small transfers only! */ extern int spi_write_then_read(struct spi_device *spi, const void *txbuf, unsigned n_tx, void *rxbuf, unsigned n_rx); /** * spi_w8r8 - SPI synchronous 8 bit write followed by 8 bit read * @spi: device with which data will be exchanged * @cmd: command to be written before data is read back * Context: can sleep * * Callable only from contexts that can sleep. * * Return: the (unsigned) eight bit number returned by the * device, or else a negative error code. */ static inline ssize_t spi_w8r8(struct spi_device *spi, u8 cmd) { ssize_t status; u8 result; status = spi_write_then_read(spi, &cmd, 1, &result, 1); /* Return negative errno or unsigned value */ return (status < 0) ? status : result; } /** * spi_w8r16 - SPI synchronous 8 bit write followed by 16 bit read * @spi: device with which data will be exchanged * @cmd: command to be written before data is read back * Context: can sleep * * The number is returned in wire-order, which is at least sometimes * big-endian. * * Callable only from contexts that can sleep. * * Return: the (unsigned) sixteen bit number returned by the * device, or else a negative error code. */ static inline ssize_t spi_w8r16(struct spi_device *spi, u8 cmd) { ssize_t status; u16 result; status = spi_write_then_read(spi, &cmd, 1, &result, 2); /* Return negative errno or unsigned value */ return (status < 0) ? status : result; } /** * spi_w8r16be - SPI synchronous 8 bit write followed by 16 bit big-endian read * @spi: device with which data will be exchanged * @cmd: command to be written before data is read back * Context: can sleep * * This function is similar to spi_w8r16, with the exception that it will * convert the read 16 bit data word from big-endian to native endianness. * * Callable only from contexts that can sleep. * * Return: the (unsigned) sixteen bit number returned by the device in CPU * endianness, or else a negative error code. */ static inline ssize_t spi_w8r16be(struct spi_device *spi, u8 cmd) { ssize_t status; __be16 result; status = spi_write_then_read(spi, &cmd, 1, &result, 2); if (status < 0) return status; return be16_to_cpu(result); } /*---------------------------------------------------------------------------*/ /* * INTERFACE between board init code and SPI infrastructure. * * No SPI driver ever sees these SPI device table segments, but * it's how the SPI core (or adapters that get hotplugged) grows * the driver model tree. * * As a rule, SPI devices can't be probed. Instead, board init code * provides a table listing the devices which are present, with enough * information to bind and set up the device's driver. There's basic * support for non-static configurations too; enough to handle adding * parport adapters, or microcontrollers acting as USB-to-SPI bridges. */ /** * struct spi_board_info - board-specific template for a SPI device * @modalias: Initializes spi_device.modalias; identifies the driver. * @platform_data: Initializes spi_device.platform_data; the particular * data stored there is driver-specific. * @swnode: Software node for the device. * @controller_data: Initializes spi_device.controller_data; some * controllers need hints about hardware setup, e.g. for DMA. * @irq: Initializes spi_device.irq; depends on how the board is wired. * @max_speed_hz: Initializes spi_device.max_speed_hz; based on limits * from the chip datasheet and board-specific signal quality issues. * @bus_num: Identifies which spi_controller parents the spi_device; unused * by spi_new_device(), and otherwise depends on board wiring. * @chip_select: Initializes spi_device.chip_select; depends on how * the board is wired. * @mode: Initializes spi_device.mode; based on the chip datasheet, board * wiring (some devices support both 3WIRE and standard modes), and * possibly presence of an inverter in the chipselect path. * * When adding new SPI devices to the device tree, these structures serve * as a partial device template. They hold information which can't always * be determined by drivers. Information that probe() can establish (such * as the default transfer wordsize) is not included here. * * These structures are used in two places. Their primary role is to * be stored in tables of board-specific device descriptors, which are * declared early in board initialization and then used (much later) to * populate a controller's device tree after the that controller's driver * initializes. A secondary (and atypical) role is as a parameter to * spi_new_device() call, which happens after those controller drivers * are active in some dynamic board configuration models. */ struct spi_board_info { /* * The device name and module name are coupled, like platform_bus; * "modalias" is normally the driver name. * * platform_data goes to spi_device.dev.platform_data, * controller_data goes to spi_device.controller_data, * IRQ is copied too. */ char modalias[SPI_NAME_SIZE]; const void *platform_data; const struct software_node *swnode; void *controller_data; int irq; /* Slower signaling on noisy or low voltage boards */ u32 max_speed_hz; /* * bus_num is board specific and matches the bus_num of some * spi_controller that will probably be registered later. * * chip_select reflects how this chip is wired to that controller; * it's less than num_chipselect. */ u16 bus_num; u16 chip_select; /* * mode becomes spi_device.mode, and is essential for chips * where the default of SPI_CS_HIGH = 0 is wrong. */ u32 mode; /* * ... may need additional spi_device chip config data here. * avoid stuff protocol drivers can set; but include stuff * needed to behave without being bound to a driver: * - quirks like clock rate mattering when not selected */ }; #ifdef CONFIG_SPI extern int spi_register_board_info(struct spi_board_info const *info, unsigned n); #else /* Board init code may ignore whether SPI is configured or not */ static inline int spi_register_board_info(struct spi_board_info const *info, unsigned n) { return 0; } #endif /* * If you're hotplugging an adapter with devices (parport, USB, etc) * use spi_new_device() to describe each device. You can also call * spi_unregister_device() to start making that device vanish, but * normally that would be handled by spi_unregister_controller(). * * You can also use spi_alloc_device() and spi_add_device() to use a two * stage registration sequence for each spi_device. This gives the caller * some more control over the spi_device structure before it is registered, * but requires that caller to initialize fields that would otherwise * be defined using the board info. */ extern struct spi_device * spi_alloc_device(struct spi_controller *ctlr); extern int spi_add_device(struct spi_device *spi); extern struct spi_device * spi_new_device(struct spi_controller *, struct spi_board_info *); extern void spi_unregister_device(struct spi_device *spi); extern const struct spi_device_id * spi_get_device_id(const struct spi_device *sdev); extern const void * spi_get_device_match_data(const struct spi_device *sdev); static inline bool spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer) { return list_is_last(&xfer->transfer_list, &ctlr->cur_msg->transfers); } #endif /* __LINUX_SPI_H */
3 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 /* * Update: The Berkeley copyright was changed, and the change * is retroactive to all "true" BSD software (ie everything * from UCB as opposed to other peoples code that just carried * the same license). The new copyright doesn't clash with the * GPL, so the module-only restriction has been removed.. */ /* Because this code is derived from the 4.3BSD compress source: * * Copyright (c) 1985, 1986 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * James A. Woods, derived from original work by Spencer Thomas * and Joseph Orost. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This version is for use with contiguous buffers on Linux-derived systems. * * ==FILEVERSION 20000226== * * NOTE TO MAINTAINERS: * If you modify this file at all, please set the number above to the * date of the modification as YYMMDD (year month day). * bsd_comp.c is shipped with a PPP distribution as well as with * the kernel; if everyone increases the FILEVERSION number above, * then scripts can do the right thing when deciding whether to * install a new bsd_comp.c file. Don't change the format of that * line otherwise, so the installation script can recognize it. * * From: bsd_comp.c,v 1.3 1994/12/08 01:59:58 paulus Exp */ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/ppp_defs.h> #undef PACKETPTR #define PACKETPTR 1 #include <linux/ppp-comp.h> #undef PACKETPTR #include <asm/byteorder.h> /* * PPP "BSD compress" compression * The differences between this compression and the classic BSD LZW * source are obvious from the requirement that the classic code worked * with files while this handles arbitrarily long streams that * are broken into packets. They are: * * When the code size expands, a block of junk is not emitted by * the compressor and not expected by the decompressor. * * New codes are not necessarily assigned every time an old * code is output by the compressor. This is because a packet * end forces a code to be emitted, but does not imply that a * new sequence has been seen. * * The compression ratio is checked at the first end of a packet * after the appropriate gap. Besides simplifying and speeding * things up, this makes it more likely that the transmitter * and receiver will agree when the dictionary is cleared when * compression is not going well. */ /* * Macros to extract protocol version and number of bits * from the third byte of the BSD Compress CCP configuration option. */ #define BSD_VERSION(x) ((x) >> 5) #define BSD_NBITS(x) ((x) & 0x1F) #define BSD_CURRENT_VERSION 1 /* * A dictionary for doing BSD compress. */ struct bsd_dict { union { /* hash value */ unsigned long fcode; struct { #if defined(__LITTLE_ENDIAN) /* Little endian order */ unsigned short prefix; /* preceding code */ unsigned char suffix; /* last character of new code */ unsigned char pad; #elif defined(__BIG_ENDIAN) /* Big endian order */ unsigned char pad; unsigned char suffix; /* last character of new code */ unsigned short prefix; /* preceding code */ #else #error Endianness not defined... #endif } hs; } f; unsigned short codem1; /* output of hash table -1 */ unsigned short cptr; /* map code to hash table entry */ }; struct bsd_db { int totlen; /* length of this structure */ unsigned int hsize; /* size of the hash table */ unsigned char hshift; /* used in hash function */ unsigned char n_bits; /* current bits/code */ unsigned char maxbits; /* maximum bits/code */ unsigned char debug; /* non-zero if debug desired */ unsigned char unit; /* ppp unit number */ unsigned short seqno; /* sequence # of next packet */ unsigned int mru; /* size of receive (decompress) bufr */ unsigned int maxmaxcode; /* largest valid code */ unsigned int max_ent; /* largest code in use */ unsigned int in_count; /* uncompressed bytes, aged */ unsigned int bytes_out; /* compressed bytes, aged */ unsigned int ratio; /* recent compression ratio */ unsigned int checkpoint; /* when to next check the ratio */ unsigned int clear_count; /* times dictionary cleared */ unsigned int incomp_count; /* incompressible packets */ unsigned int incomp_bytes; /* incompressible bytes */ unsigned int uncomp_count; /* uncompressed packets */ unsigned int uncomp_bytes; /* uncompressed bytes */ unsigned int comp_count; /* compressed packets */ unsigned int comp_bytes; /* compressed bytes */ unsigned short *lens; /* array of lengths of codes */ struct bsd_dict *dict; /* dictionary */ }; #define BSD_OVHD 2 /* BSD compress overhead/packet */ #define MIN_BSD_BITS 9 #define BSD_INIT_BITS MIN_BSD_BITS #define MAX_BSD_BITS 15 static void bsd_free (void *state); static void *bsd_alloc(unsigned char *options, int opt_len, int decomp); static void *bsd_comp_alloc (unsigned char *options, int opt_len); static void *bsd_decomp_alloc (unsigned char *options, int opt_len); static int bsd_init (void *db, unsigned char *options, int opt_len, int unit, int debug, int decomp); static int bsd_comp_init (void *state, unsigned char *options, int opt_len, int unit, int opthdr, int debug); static int bsd_decomp_init (void *state, unsigned char *options, int opt_len, int unit, int opthdr, int mru, int debug); static void bsd_reset (void *state); static void bsd_comp_stats (void *state, struct compstat *stats); static int bsd_compress (void *state, unsigned char *rptr, unsigned char *obuf, int isize, int osize); static void bsd_incomp (void *state, unsigned char *ibuf, int icnt); static int bsd_decompress (void *state, unsigned char *ibuf, int isize, unsigned char *obuf, int osize); /* These are in ppp_generic.c */ extern int ppp_register_compressor (struct compressor *cp); extern void ppp_unregister_compressor (struct compressor *cp); /* * the next two codes should not be changed lightly, as they must not * lie within the contiguous general code space. */ #define CLEAR 256 /* table clear output code */ #define FIRST 257 /* first free entry */ #define LAST 255 #define MAXCODE(b) ((1 << (b)) - 1) #define BADCODEM1 MAXCODE(MAX_BSD_BITS) #define BSD_HASH(prefix,suffix,hshift) ((((unsigned long)(suffix))<<(hshift)) \ ^ (unsigned long)(prefix)) #define BSD_KEY(prefix,suffix) ((((unsigned long)(suffix)) << 16) \ + (unsigned long)(prefix)) #define CHECK_GAP 10000 /* Ratio check interval */ #define RATIO_SCALE_LOG 8 #define RATIO_SCALE (1<<RATIO_SCALE_LOG) #define RATIO_MAX (0x7fffffff>>RATIO_SCALE_LOG) /* * clear the dictionary */ static void bsd_clear(struct bsd_db *db) { db->clear_count++; db->max_ent = FIRST-1; db->n_bits = BSD_INIT_BITS; db->bytes_out = 0; db->in_count = 0; db->ratio = 0; db->checkpoint = CHECK_GAP; } /* * If the dictionary is full, then see if it is time to reset it. * * Compute the compression ratio using fixed-point arithmetic * with 8 fractional bits. * * Since we have an infinite stream instead of a single file, * watch only the local compression ratio. * * Since both peers must reset the dictionary at the same time even in * the absence of CLEAR codes (while packets are incompressible), they * must compute the same ratio. */ static int bsd_check (struct bsd_db *db) /* 1=output CLEAR */ { unsigned int new_ratio; if (db->in_count >= db->checkpoint) { /* age the ratio by limiting the size of the counts */ if (db->in_count >= RATIO_MAX || db->bytes_out >= RATIO_MAX) { db->in_count -= (db->in_count >> 2); db->bytes_out -= (db->bytes_out >> 2); } db->checkpoint = db->in_count + CHECK_GAP; if (db->max_ent >= db->maxmaxcode) { /* Reset the dictionary only if the ratio is worse, * or if it looks as if it has been poisoned * by incompressible data. * * This does not overflow, because * db->in_count <= RATIO_MAX. */ new_ratio = db->in_count << RATIO_SCALE_LOG; if (db->bytes_out != 0) { new_ratio /= db->bytes_out; } if (new_ratio < db->ratio || new_ratio < 1 * RATIO_SCALE) { bsd_clear (db); return 1; } db->ratio = new_ratio; } } return 0; } /* * Return statistics. */ static void bsd_comp_stats (void *state, struct compstat *stats) { struct bsd_db *db = (struct bsd_db *) state; stats->unc_bytes = db->uncomp_bytes; stats->unc_packets = db->uncomp_count; stats->comp_bytes = db->comp_bytes; stats->comp_packets = db->comp_count; stats->inc_bytes = db->incomp_bytes; stats->inc_packets = db->incomp_count; stats->in_count = db->in_count; stats->bytes_out = db->bytes_out; } /* * Reset state, as on a CCP ResetReq. */ static void bsd_reset (void *state) { struct bsd_db *db = (struct bsd_db *) state; bsd_clear(db); db->seqno = 0; db->clear_count = 0; } /* * Release the compression structure */ static void bsd_free (void *state) { struct bsd_db *db = state; if (!db) return; /* * Release the dictionary */ vfree(db->dict); db->dict = NULL; /* * Release the string buffer */ vfree(db->lens); db->lens = NULL; /* * Finally release the structure itself. */ kfree(db); } /* * Allocate space for a (de) compressor. */ static void *bsd_alloc (unsigned char *options, int opt_len, int decomp) { int bits; unsigned int hsize, hshift, maxmaxcode; struct bsd_db *db; if (opt_len != 3 || options[0] != CI_BSD_COMPRESS || options[1] != 3 || BSD_VERSION(options[2]) != BSD_CURRENT_VERSION) { return NULL; } bits = BSD_NBITS(options[2]); switch (bits) { case 9: /* needs 82152 for both directions */ case 10: /* needs 84144 */ case 11: /* needs 88240 */ case 12: /* needs 96432 */ hsize = 5003; hshift = 4; break; case 13: /* needs 176784 */ hsize = 9001; hshift = 5; break; case 14: /* needs 353744 */ hsize = 18013; hshift = 6; break; case 15: /* needs 691440 */ hsize = 35023; hshift = 7; break; case 16: /* needs 1366160--far too much, */ /* hsize = 69001; */ /* and 69001 is too big for cptr */ /* hshift = 8; */ /* in struct bsd_db */ /* break; */ default: return NULL; } /* * Allocate the main control structure for this instance. */ maxmaxcode = MAXCODE(bits); db = kzalloc(sizeof (struct bsd_db), GFP_KERNEL); if (!db) { return NULL; } /* * Allocate space for the dictionary. This may be more than one page in * length. */ db->dict = vmalloc(array_size(hsize, sizeof(struct bsd_dict))); if (!db->dict) { bsd_free (db); return NULL; } /* * If this is the compression buffer then there is no length data. */ if (!decomp) { db->lens = NULL; } /* * For decompression, the length information is needed as well. */ else { db->lens = vmalloc(array_size(sizeof(db->lens[0]), (maxmaxcode + 1))); if (!db->lens) { bsd_free (db); return NULL; } } /* * Initialize the data information for the compression code */ db->totlen = sizeof (struct bsd_db) + (sizeof (struct bsd_dict) * hsize); db->hsize = hsize; db->hshift = hshift; db->maxmaxcode = maxmaxcode; db->maxbits = bits; return (void *) db; } static void *bsd_comp_alloc (unsigned char *options, int opt_len) { return bsd_alloc (options, opt_len, 0); } static void *bsd_decomp_alloc (unsigned char *options, int opt_len) { return bsd_alloc (options, opt_len, 1); } /* * Initialize the database. */ static int bsd_init (void *state, unsigned char *options, int opt_len, int unit, int debug, int decomp) { struct bsd_db *db = state; int indx; if ((opt_len != 3) || (options[0] != CI_BSD_COMPRESS) || (options[1] != 3) || (BSD_VERSION(options[2]) != BSD_CURRENT_VERSION) || (BSD_NBITS(options[2]) != db->maxbits) || (decomp && db->lens == NULL)) { return 0; } if (decomp) { indx = LAST; do { db->lens[indx] = 1; } while (indx-- > 0); } indx = db->hsize; while (indx-- != 0) { db->dict[indx].codem1 = BADCODEM1; db->dict[indx].cptr = 0; } db->unit = unit; db->mru = 0; #ifndef DEBUG if (debug) #endif db->debug = 1; bsd_reset(db); return 1; } static int bsd_comp_init (void *state, unsigned char *options, int opt_len, int unit, int opthdr, int debug) { return bsd_init (state, options, opt_len, unit, debug, 0); } static int bsd_decomp_init (void *state, unsigned char *options, int opt_len, int unit, int opthdr, int mru, int debug) { return bsd_init (state, options, opt_len, unit, debug, 1); } /* * Obtain pointers to the various structures in the compression tables */ #define dict_ptrx(p,idx) &(p->dict[idx]) #define lens_ptrx(p,idx) &(p->lens[idx]) #ifdef DEBUG static unsigned short *lens_ptr(struct bsd_db *db, int idx) { if ((unsigned int) idx > (unsigned int) db->maxmaxcode) { printk ("<9>ppp: lens_ptr(%d) > max\n", idx); idx = 0; } return lens_ptrx (db, idx); } static struct bsd_dict *dict_ptr(struct bsd_db *db, int idx) { if ((unsigned int) idx >= (unsigned int) db->hsize) { printk ("<9>ppp: dict_ptr(%d) > max\n", idx); idx = 0; } return dict_ptrx (db, idx); } #else #define lens_ptr(db,idx) lens_ptrx(db,idx) #define dict_ptr(db,idx) dict_ptrx(db,idx) #endif /* * compress a packet * * The result of this function is the size of the compressed * packet. A zero is returned if the packet was not compressed * for some reason, such as the size being larger than uncompressed. * * One change from the BSD compress command is that when the * code size expands, we do not output a bunch of padding. */ static int bsd_compress (void *state, unsigned char *rptr, unsigned char *obuf, int isize, int osize) { struct bsd_db *db; int hshift; unsigned int max_ent; unsigned int n_bits; unsigned int bitno; unsigned long accm; int ent; unsigned long fcode; struct bsd_dict *dictp; unsigned char c; int hval; int disp; int ilen; int mxcode; unsigned char *wptr; int olen; #define PUTBYTE(v) \ { \ ++olen; \ if (wptr) \ { \ *wptr++ = (unsigned char) (v); \ if (olen >= osize) \ { \ wptr = NULL; \ } \ } \ } #define OUTPUT(ent) \ { \ bitno -= n_bits; \ accm |= ((ent) << bitno); \ do \ { \ PUTBYTE(accm >> 24); \ accm <<= 8; \ bitno += 8; \ } \ while (bitno <= 24); \ } /* * If the protocol is not in the range we're interested in, * just return without compressing the packet. If it is, * the protocol becomes the first byte to compress. */ ent = PPP_PROTOCOL(rptr); if (ent < 0x21 || ent > 0xf9) { return 0; } db = (struct bsd_db *) state; hshift = db->hshift; max_ent = db->max_ent; n_bits = db->n_bits; bitno = 32; accm = 0; mxcode = MAXCODE (n_bits); /* Initialize the output pointers */ wptr = obuf; olen = PPP_HDRLEN + BSD_OVHD; if (osize > isize) { osize = isize; } /* This is the PPP header information */ if (wptr) { *wptr++ = PPP_ADDRESS(rptr); *wptr++ = PPP_CONTROL(rptr); *wptr++ = 0; *wptr++ = PPP_COMP; *wptr++ = db->seqno >> 8; *wptr++ = db->seqno; } /* Skip the input header */ rptr += PPP_HDRLEN; isize -= PPP_HDRLEN; ilen = ++isize; /* Low byte of protocol is counted as input */ while (--ilen > 0) { c = *rptr++; fcode = BSD_KEY (ent, c); hval = BSD_HASH (ent, c, hshift); dictp = dict_ptr (db, hval); /* Validate and then check the entry. */ if (dictp->codem1 >= max_ent) { goto nomatch; } if (dictp->f.fcode == fcode) { ent = dictp->codem1 + 1; continue; /* found (prefix,suffix) */ } /* continue probing until a match or invalid entry */ disp = (hval == 0) ? 1 : hval; do { hval += disp; if (hval >= db->hsize) { hval -= db->hsize; } dictp = dict_ptr (db, hval); if (dictp->codem1 >= max_ent) { goto nomatch; } } while (dictp->f.fcode != fcode); ent = dictp->codem1 + 1; /* finally found (prefix,suffix) */ continue; nomatch: OUTPUT(ent); /* output the prefix */ /* code -> hashtable */ if (max_ent < db->maxmaxcode) { struct bsd_dict *dictp2; struct bsd_dict *dictp3; int indx; /* expand code size if needed */ if (max_ent >= mxcode) { db->n_bits = ++n_bits; mxcode = MAXCODE (n_bits); } /* Invalidate old hash table entry using * this code, and then take it over. */ dictp2 = dict_ptr (db, max_ent + 1); indx = dictp2->cptr; dictp3 = dict_ptr (db, indx); if (dictp3->codem1 == max_ent) { dictp3->codem1 = BADCODEM1; } dictp2->cptr = hval; dictp->codem1 = max_ent; dictp->f.fcode = fcode; db->max_ent = ++max_ent; if (db->lens) { unsigned short *len1 = lens_ptr (db, max_ent); unsigned short *len2 = lens_ptr (db, ent); *len1 = *len2 + 1; } } ent = c; } OUTPUT(ent); /* output the last code */ db->bytes_out += olen - PPP_HDRLEN - BSD_OVHD; db->uncomp_bytes += isize; db->in_count += isize; ++db->uncomp_count; ++db->seqno; if (bitno < 32) { ++db->bytes_out; /* must be set before calling bsd_check */ } /* * Generate the clear command if needed */ if (bsd_check(db)) { OUTPUT (CLEAR); } /* * Pad dribble bits of last code with ones. * Do not emit a completely useless byte of ones. */ if (bitno != 32) { PUTBYTE((accm | (0xff << (bitno-8))) >> 24); } /* * Increase code size if we would have without the packet * boundary because the decompressor will do so. */ if (max_ent >= mxcode && max_ent < db->maxmaxcode) { db->n_bits++; } /* If output length is too large then this is an incomplete frame. */ if (wptr == NULL) { ++db->incomp_count; db->incomp_bytes += isize; olen = 0; } else /* Count the number of compressed frames */ { ++db->comp_count; db->comp_bytes += olen; } /* Return the resulting output length */ return olen; #undef OUTPUT #undef PUTBYTE } /* * Update the "BSD Compress" dictionary on the receiver for * incompressible data by pretending to compress the incoming data. */ static void bsd_incomp (void *state, unsigned char *ibuf, int icnt) { (void) bsd_compress (state, ibuf, (char *) 0, icnt, 0); } /* * Decompress "BSD Compress". * * Because of patent problems, we return DECOMP_ERROR for errors * found by inspecting the input data and for system problems, but * DECOMP_FATALERROR for any errors which could possibly be said to * be being detected "after" decompression. For DECOMP_ERROR, * we can issue a CCP reset-request; for DECOMP_FATALERROR, we may be * infringing a patent of Motorola's if we do, so we take CCP down * instead. * * Given that the frame has the correct sequence number and a good FCS, * errors such as invalid codes in the input most likely indicate a * bug, so we return DECOMP_FATALERROR for them in order to turn off * compression, even though they are detected by inspecting the input. */ static int bsd_decompress (void *state, unsigned char *ibuf, int isize, unsigned char *obuf, int osize) { struct bsd_db *db; unsigned int max_ent; unsigned long accm; unsigned int bitno; /* 1st valid bit in accm */ unsigned int n_bits; unsigned int tgtbitno; /* bitno when we have a code */ struct bsd_dict *dictp; int explen; int seq; unsigned int incode; unsigned int oldcode; unsigned int finchar; unsigned char *p; unsigned char *wptr; int adrs; int ctrl; int ilen; int codelen; int extra; db = (struct bsd_db *) state; max_ent = db->max_ent; accm = 0; bitno = 32; /* 1st valid bit in accm */ n_bits = db->n_bits; tgtbitno = 32 - n_bits; /* bitno when we have a code */ /* * Save the address/control from the PPP header * and then get the sequence number. */ adrs = PPP_ADDRESS (ibuf); ctrl = PPP_CONTROL (ibuf); seq = (ibuf[4] << 8) + ibuf[5]; ibuf += (PPP_HDRLEN + 2); ilen = isize - (PPP_HDRLEN + 2); /* * Check the sequence number and give up if it differs from * the value we're expecting. */ if (seq != db->seqno) { if (db->debug) { printk("bsd_decomp%d: bad sequence # %d, expected %d\n", db->unit, seq, db->seqno - 1); } return DECOMP_ERROR; } ++db->seqno; db->bytes_out += ilen; /* * Fill in the ppp header, but not the last byte of the protocol * (that comes from the decompressed data). */ wptr = obuf; *wptr++ = adrs; *wptr++ = ctrl; *wptr++ = 0; oldcode = CLEAR; explen = 3; /* * Keep the checkpoint correctly so that incompressible packets * clear the dictionary at the proper times. */ for (;;) { if (ilen-- <= 0) { db->in_count += (explen - 3); /* don't count the header */ break; } /* * Accumulate bytes until we have a complete code. * Then get the next code, relying on the 32-bit, * unsigned accm to mask the result. */ bitno -= 8; accm |= *ibuf++ << bitno; if (tgtbitno < bitno) { continue; } incode = accm >> tgtbitno; accm <<= n_bits; bitno += n_bits; /* * The dictionary must only be cleared at the end of a packet. */ if (incode == CLEAR) { if (ilen > 0) { if (db->debug) { printk("bsd_decomp%d: bad CLEAR\n", db->unit); } return DECOMP_FATALERROR; /* probably a bug */ } bsd_clear(db); break; } if ((incode > max_ent + 2) || (incode > db->maxmaxcode) || (incode > max_ent && oldcode == CLEAR)) { if (db->debug) { printk("bsd_decomp%d: bad code 0x%x oldcode=0x%x ", db->unit, incode, oldcode); printk("max_ent=0x%x explen=%d seqno=%d\n", max_ent, explen, db->seqno); } return DECOMP_FATALERROR; /* probably a bug */ } /* Special case for KwKwK string. */ if (incode > max_ent) { finchar = oldcode; extra = 1; } else { finchar = incode; extra = 0; } codelen = *(lens_ptr (db, finchar)); explen += codelen + extra; if (explen > osize) { if (db->debug) { printk("bsd_decomp%d: ran out of mru\n", db->unit); #ifdef DEBUG printk(" len=%d, finchar=0x%x, codelen=%d, explen=%d\n", ilen, finchar, codelen, explen); #endif } return DECOMP_FATALERROR; } /* * Decode this code and install it in the decompressed buffer. */ wptr += codelen; p = wptr; while (finchar > LAST) { struct bsd_dict *dictp2 = dict_ptr (db, finchar); dictp = dict_ptr (db, dictp2->cptr); #ifdef DEBUG if (--codelen <= 0 || dictp->codem1 != finchar-1) { if (codelen <= 0) { printk("bsd_decomp%d: fell off end of chain ", db->unit); printk("0x%x at 0x%x by 0x%x, max_ent=0x%x\n", incode, finchar, dictp2->cptr, max_ent); } else { if (dictp->codem1 != finchar-1) { printk("bsd_decomp%d: bad code chain 0x%x " "finchar=0x%x ", db->unit, incode, finchar); printk("oldcode=0x%x cptr=0x%x codem1=0x%x\n", oldcode, dictp2->cptr, dictp->codem1); } } return DECOMP_FATALERROR; } #endif *--p = dictp->f.hs.suffix; finchar = dictp->f.hs.prefix; } *--p = finchar; #ifdef DEBUG if (--codelen != 0) { printk("bsd_decomp%d: short by %d after code 0x%x, max_ent=0x%x\n", db->unit, codelen, incode, max_ent); } #endif if (extra) /* the KwKwK case again */ { *wptr++ = finchar; } /* * If not first code in a packet, and * if not out of code space, then allocate a new code. * * Keep the hash table correct so it can be used * with uncompressed packets. */ if (oldcode != CLEAR && max_ent < db->maxmaxcode) { struct bsd_dict *dictp2, *dictp3; unsigned short *lens1, *lens2; unsigned long fcode; int hval, disp, indx; fcode = BSD_KEY(oldcode,finchar); hval = BSD_HASH(oldcode,finchar,db->hshift); dictp = dict_ptr (db, hval); /* look for a free hash table entry */ if (dictp->codem1 < max_ent) { disp = (hval == 0) ? 1 : hval; do { hval += disp; if (hval >= db->hsize) { hval -= db->hsize; } dictp = dict_ptr (db, hval); } while (dictp->codem1 < max_ent); } /* * Invalidate previous hash table entry * assigned this code, and then take it over */ dictp2 = dict_ptr (db, max_ent + 1); indx = dictp2->cptr; dictp3 = dict_ptr (db, indx); if (dictp3->codem1 == max_ent) { dictp3->codem1 = BADCODEM1; } dictp2->cptr = hval; dictp->codem1 = max_ent; dictp->f.fcode = fcode; db->max_ent = ++max_ent; /* Update the length of this string. */ lens1 = lens_ptr (db, max_ent); lens2 = lens_ptr (db, oldcode); *lens1 = *lens2 + 1; /* Expand code size if needed. */ if (max_ent >= MAXCODE(n_bits) && max_ent < db->maxmaxcode) { db->n_bits = ++n_bits; tgtbitno = 32-n_bits; } } oldcode = incode; } ++db->comp_count; ++db->uncomp_count; db->comp_bytes += isize - BSD_OVHD - PPP_HDRLEN; db->uncomp_bytes += explen; if (bsd_check(db)) { if (db->debug) { printk("bsd_decomp%d: peer should have cleared dictionary on %d\n", db->unit, db->seqno - 1); } } return explen; } /************************************************************* * Table of addresses for the BSD compression module *************************************************************/ static struct compressor ppp_bsd_compress = { .compress_proto = CI_BSD_COMPRESS, .comp_alloc = bsd_comp_alloc, .comp_free = bsd_free, .comp_init = bsd_comp_init, .comp_reset = bsd_reset, .compress = bsd_compress, .comp_stat = bsd_comp_stats, .decomp_alloc = bsd_decomp_alloc, .decomp_free = bsd_free, .decomp_init = bsd_decomp_init, .decomp_reset = bsd_reset, .decompress = bsd_decompress, .incomp = bsd_incomp, .decomp_stat = bsd_comp_stats, .owner = THIS_MODULE }; /************************************************************* * Module support routines *************************************************************/ static int __init bsdcomp_init(void) { int answer = ppp_register_compressor(&ppp_bsd_compress); if (answer == 0) printk(KERN_INFO "PPP BSD Compression module registered\n"); return answer; } static void __exit bsdcomp_cleanup(void) { ppp_unregister_compressor(&ppp_bsd_compress); } module_init(bsdcomp_init); module_exit(bsdcomp_cleanup); MODULE_DESCRIPTION("PPP BSD-Compress compression module"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("ppp-compress-" __stringify(CI_BSD_COMPRESS));
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 // SPDX-License-Identifier: GPL-2.0-only /* * IPV4 GSO/GRO offload support * Linux INET implementation * * Copyright (C) 2016 secunet Security Networks AG * Author: Steffen Klassert <steffen.klassert@secunet.com> * * ESP GRO support */ #include <linux/skbuff.h> #include <linux/init.h> #include <net/protocol.h> #include <crypto/aead.h> #include <crypto/authenc.h> #include <linux/err.h> #include <linux/module.h> #include <net/gro.h> #include <net/gso.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/esp.h> #include <linux/scatterlist.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/udp.h> static struct sk_buff *esp4_gro_receive(struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; struct xfrm_state *x; int encap_type = 0; __be32 seq; __be32 spi; if (!pskb_pull(skb, offset)) return NULL; if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { struct sec_path *sp = secpath_set(skb); if (!sp) goto out; if (sp->len == XFRM_MAX_DEPTH) goto out_reset; x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ip_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ xfrm_state_put(x); x = NULL; } if (!x) goto out_reset; skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; sp->olen++; xo = xfrm_offload(skb); if (!xo) goto out_reset; } xo->flags |= XFRM_GRO; if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP) encap_type = UDP_ENCAP_ESPINUDP; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_SPI_SKB_CB(skb)->seq = seq; /* We don't need to handle errors from xfrm_input, it does all * the error handling and frees the resources on error. */ xfrm_input(skb, IPPROTO_ESP, spi, encap_type); return ERR_PTR(-EINPROGRESS); out_reset: secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; struct iphdr *iph = ip_hdr(skb); struct xfrm_offload *xo = xfrm_offload(skb); int proto = iph->protocol; skb_push(skb, -skb_network_offset(skb)); esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); xo->proto = proto; } static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { __be16 type = x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP); return skb_eth_gso_segment(skb, features, type); } static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { const struct net_offload *ops; struct sk_buff *segs = ERR_PTR(-EINVAL); struct xfrm_offload *xo = xfrm_offload(skb); skb->transport_header += x->props.header_len; ops = rcu_dereference(inet_offloads[xo->proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { struct xfrm_offload *xo = xfrm_offload(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; u8 proto = xo->proto; skb->transport_header += x->props.header_len; if (x->sel.family != AF_INET6) { if (proto == IPPROTO_BEETPH) { struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; skb->transport_header += ph->hdrlen * 8; proto = ph->nexthdr; } else { skb->transport_header -= IPV4_BEET_PHMAXLEN; } } else { __be16 frag; skb->transport_header += ipv6_skip_exthdr(skb, 0, &proto, &frag); if (proto == IPPROTO_TCP) skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } if (proto == IPPROTO_IPV6) skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { switch (x->outer_mode.encap) { case XFRM_MODE_TUNNEL: return xfrm4_tunnel_gso_segment(x, skb, features); case XFRM_MODE_TRANSPORT: return xfrm4_transport_gso_segment(x, skb, features); case XFRM_MODE_BEET: return xfrm4_beet_gso_segment(x, skb, features); } return ERR_PTR(-EOPNOTSUPP); } static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) && !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); else if (!(features & NETIF_F_HW_ESP_TX_CSUM) && !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~(NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); xo->flags |= XFRM_GSO_SEGMENT; return xfrm4_outer_mode_gso_segment(x, skb, esp_features); } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) { struct crypto_aead *aead = x->data; struct xfrm_offload *xo = xfrm_offload(skb); if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) return -EINVAL; if (!(xo->flags & CRYPTO_DONE)) skb->ip_summed = CHECKSUM_NONE; return esp_input_done2(skb, 0); } static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { int err; int alen; int blksize; struct xfrm_offload *xo; struct ip_esp_hdr *esph; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; __u32 seq; int encap_type = 0; esp.inplace = true; xo = xfrm_offload(skb); if (!xo) return -EINVAL; if ((!(features & NETIF_F_HW_ESP) && !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } esp.proto = xo->proto; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); esp.tfclen = 0; /* XXX: Add support for tfc padding here. */ blksize = ALIGN(crypto_aead_blocksize(aead), 4); esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; esp.esph = ip_esp_hdr(skb); if (x->encap) encap_type = x->encap->encap_type; if (!hw_offload || !skb_is_gso(skb) || (hw_offload && encap_type == UDP_ENCAP_ESPINUDP)) { esp.nfrags = esp_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; } seq = xo->seq.low; esph = esp.esph; esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; else xo->seq.low += skb_shinfo(skb)->gso_segs; } if (xo->seq.low < seq) xo->seq.hi++; esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); if (hw_offload && encap_type == UDP_ENCAP_ESPINUDP) { /* In the XFRM stack, the encapsulation protocol is set to iphdr->protocol by * setting *skb_mac_header(skb) (see esp_output_udp_encap()) where skb->mac_header * points to iphdr->protocol (see xfrm4_tunnel_encap_add()). * However, in esp_xmit(), skb->mac_header doesn't point to iphdr->protocol. * Therefore, the protocol field needs to be corrected. */ ip_hdr(skb)->protocol = IPPROTO_UDP; esph->seq_no = htonl(seq); } ip_hdr(skb)->tot_len = htons(skb->len); ip_send_check(ip_hdr(skb)); if (hw_offload) { if (!skb_ext_add(skb, SKB_EXT_SEC_PATH)) return -ENOMEM; xo = xfrm_offload(skb); if (!xo) return -EINVAL; xo->flags |= XFRM_XMIT; return 0; } err = esp_output_tail(x, skb, &esp); if (err) return err; secpath_reset(skb); if (skb_needs_linearize(skb, skb->dev->features) && __skb_linearize(skb)) return -ENOMEM; return 0; } static const struct net_offload esp4_offload = { .callbacks = { .gro_receive = esp4_gro_receive, .gso_segment = esp4_gso_segment, }, }; static const struct xfrm_type_offload esp_type_offload = { .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp_input_tail, .xmit = esp_xmit, .encap = esp4_gso_encap, }; static int __init esp4_offload_init(void) { if (xfrm_register_type_offload(&esp_type_offload, AF_INET) < 0) { pr_info("%s: can't add xfrm type offload\n", __func__); return -EAGAIN; } return inet_add_offload(&esp4_offload, IPPROTO_ESP); } static void __exit esp4_offload_exit(void) { xfrm_unregister_type_offload(&esp_type_offload, AF_INET); inet_del_offload(&esp4_offload, IPPROTO_ESP); } module_init(esp4_offload_init); module_exit(esp4_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP); MODULE_DESCRIPTION("IPV4 GSO/GRO offload support");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2009 IBM Corporation * Author: Mimi Zohar <zohar@us.ibm.com> */ #ifndef _LINUX_INTEGRITY_H #define _LINUX_INTEGRITY_H #include <linux/fs.h> #include <linux/iversion.h> enum integrity_status { INTEGRITY_PASS = 0, INTEGRITY_PASS_IMMUTABLE, INTEGRITY_FAIL, INTEGRITY_FAIL_IMMUTABLE, INTEGRITY_NOLABEL, INTEGRITY_NOXATTRS, INTEGRITY_UNKNOWN, }; #ifdef CONFIG_INTEGRITY extern void __init integrity_load_keys(void); #else static inline void integrity_load_keys(void) { } #endif /* CONFIG_INTEGRITY */ /* An inode's attributes for detection of changes */ struct integrity_inode_attributes { u64 version; /* track inode changes */ unsigned long ino; dev_t dev; }; /* * On stacked filesystems the i_version alone is not enough to detect file data * or metadata change. Additional metadata is required. */ static inline void integrity_inode_attrs_store(struct integrity_inode_attributes *attrs, u64 i_version, const struct inode *inode) { attrs->version = i_version; attrs->dev = inode->i_sb->s_dev; attrs->ino = inode->i_ino; } /* * On stacked filesystems detect whether the inode or its content has changed. */ static inline bool integrity_inode_attrs_changed(const struct integrity_inode_attributes *attrs, const struct inode *inode) { return (inode->i_sb->s_dev != attrs->dev || inode->i_ino != attrs->ino || !inode_eq_iversion(inode, attrs->version)); } #endif /* _LINUX_INTEGRITY_H */
19 19 19 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ /* * mballoc.c contains the multiblocks allocation routines */ #include "ext4_jbd2.h" #include "mballoc.h" #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> #include <linux/freezer.h> #include <trace/events/ext4.h> #include <kunit/static_stub.h> /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() * - search for metadata in few groups * * TODO v4: * - normalization should take into account whether file is still open * - discard preallocations if no free space left (policy?) * - don't normalize tails * - quota * - reservation for superuser * * TODO v3: * - bitmap read-ahead (proposed by Oleg Drokin aka green) * - track min/max extents in each group for better group selection * - mb_mark_used() may allocate chunk right after splitting buddy * - tree of groups sorted by number of free blocks * - error handling */ /* * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * * During initialization phase of the allocator we decide to use the * group preallocation or inode preallocation depending on the size of * the file. The size of the file could be the resulting file size we * would have after allocation, or the current file size, which ever * is larger. If the size is less than sbi->s_mb_stream_request we * select to use the group preallocation. The default value of * s_mb_stream_request is 16 blocks. This can also be tuned via * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in * terms of number of blocks. * * The main motivation for having small file use group preallocation is to * ensure that we have small files closer together on the disk. * * First stage the allocator looks at the inode prealloc list, * ext4_inode_info->i_prealloc_list, which contains list of prealloc * spaces for this particular inode. The inode prealloc space is * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space * pa_len -> length for this prealloc space (in clusters) * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc * space we will consume the particular prealloc space. This makes sure that * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except * pa_free. * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * * The reason for having a per cpu locality group is to reduce the contention * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have * enough free space (pa_free) within the prealloc space. * * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets * mapped to the buddy and bitmap information regarding different * groups. The buddy information is attached to buddy cache inode so that * we can access them through the page cache. The information regarding * each group is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are stored in the * inode as: * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / * blocksize) blocks. So it can have information regarding groups_per_page * which is blocks_per_page/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. * * We look for count number of blocks in the buddy cache. If we were able * to locate that many free blocks we return with additional information * regarding rest of the contiguous physical block available * * Before allocating blocks via buddy cache we normalize the request * blocks. This ensure we ask for more blocks that we needed. The extra * blocks that we get after allocation is added to the respective prealloc * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * * If "mb_optimize_scan" mount option is set, we maintain in memory group info * structures in two data structures: * * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) * * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) * * This is an array of lists where the index in the array represents the * largest free order in the buddy bitmap of the participating group infos of * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total * number of buddy bitmap orders possible) number of lists. Group-infos are * placed in appropriate lists. * * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) * * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) * * This is an array of lists where in the i-th list there are groups with * average fragment size >= 2^i and < 2^(i+1). The average fragment size * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. * Note that we don't bother with a special list for completely empty groups * so we only have MB_NUM_ORDERS(sb) lists. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order * >= the order of the request. We directly look at the largest free order list * in the data structure (1) above where largest_free_order = order of the * request. If that list is empty, we look at remaining list in the increasing * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED * lookup in O(1) time. * * At CR_GOAL_LEN_FAST, we only consider groups where * average fragment size > request size. So, we lookup a group which has average * fragment size just above or equal to request size using our average fragment * size group lists (data structure 2) in O(1) time. * * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in * CR_GOAL_LEN_FAST suggests that there is no BG that has avg * fragment size > goal length. So before falling to the slower * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big * enough average fragment size. This increases the chances of finding a * suitable block group in O(1) time and results in faster allocation at the * cost of reduced size of allocation. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and * CR_GOAL_LEN_FAST phase. * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contiguous block in * stripe size. This should result in better allocation on RAID setups. If * not, we search in the specific group using bitmap for best extents. The * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. For * non rotational devices, this value defaults to 0 and for rotational devices * this is set to MB_DEFAULT_LINEAR_LIMIT. * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the * subsequent request. */ /* * mballoc operates on the following data: * - on-disk bitmap * - in-core buddy (actually includes buddy and bitmap) * - preallocation descriptors (PAs) * * there are two types of preallocations: * - inode * assiged to specific inode and can be used for this inode only. * it describes part of inode's space preallocated to specific * physical blocks. any block from that preallocated can be used * independent. the descriptor just tracks number of blocks left * unused. so, before taking some block from descriptor, one must * make sure corresponded logical block isn't allocated yet. this * also means that freeing any block within descriptor's range * must discard all preallocated blocks. * - locality group * assigned to specific locality group which does not translate to * permanent set of inodes: inode can join and leave group. space * from this type of preallocation can be used for any inode. thus * it's consumed from the beginning to the end. * * relation between them can be expressed as: * in-core buddy = on-disk bitmap + preallocation descriptors * * this mean blocks mballoc considers used are: * - allocated blocks (persistent) * - preallocated blocks (non-persistent) * * consistency in mballoc world means that at any time a block is either * free or used in ALL structures. notice: "any time" should not be read * literally -- time is discrete and delimited by locks. * * to keep it simple, we don't use block numbers, instead we count number of * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. * * all operations can be expressed as: * - init buddy: buddy = on-disk + PAs * - new PA: buddy += N; PA = N * - use inode PA: on-disk += N; PA -= N * - discard inode PA buddy -= on-disk - PA; PA = 0 * - use locality group PA on-disk += N; PA -= N * - discard locality group PA buddy -= PA; PA = 0 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap * is used in real operation because we can't know actual used * bits from PA, only from on-disk bitmap * * if we follow this strict logic, then all operations above should be atomic. * given some of them can block, we'd have to use something like semaphores * killing performance on high-end SMP hardware. let's try to relax it using * the following knowledge: * 1) if buddy is referenced, it's already initialized * 2) while block is used in buddy and the buddy is referenced, * nobody can re-allocate that block * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has * bit set and PA claims same block, it's OK. IOW, one can set bit in * on-disk bitmap if buddy has same bit set or/and PA covers corresponded * block * * so, now we're building a concurrency table: * - init buddy vs. * - new PA * blocks for PA are allocated in the buddy, buddy must be referenced * until PA is linked to allocation group to avoid concurrent buddy init * - use inode PA * we need to make sure that either on-disk bitmap or PA has uptodate data * given (3) we care that PA-=N operation doesn't interfere with init * - discard inode PA * the simplest way would be to have buddy initialized by the discard * - use locality group PA * again PA-=N must be serialized with init * - discard locality group PA * the simplest way would be to have buddy initialized by the discard * - new PA vs. * - use inode PA * i_data_sem serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * some mutex should serialize them * - discard locality group PA * discard process must wait until PA isn't used by another process * - use inode PA * - use inode PA * i_data_sem or another mutex should serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * nothing wrong here -- they're different PAs covering different blocks * - discard locality group PA * discard process must wait until PA isn't used by another process * * now we're ready to make few consequences: * - PA is referenced and while it is no discard is possible * - PA is referenced until block isn't marked in on-disk bitmap * - PA changes only after on-disk bitmap * - discard must not compete with init. either init is done before * any discard or they're serialized somehow * - buddy init as sum of on-disk bitmap and PAs is done atomically * * a special case when we've used PA to emptiness. no need to modify buddy * in this case, but we should care about concurrent init * */ /* * Logic in few words: * * - allocation: * load group * find blocks * mark bits in on-disk bitmap * release group * * - use preallocation: * find proper PA (per-inode or group) * load group * mark bits in on-disk bitmap * release group * release PA * * - free: * load group * mark bits in on-disk bitmap * release group * * - discard preallocations in group: * mark PAs deleted * move them onto local list * load on-disk bitmap * load group * remove PA from object (inode or locality group) * mark free blocks in-core * * - discard inode's preallocations: */ /* * Locking rules * * Locks: * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) * - cr_power2_aligned lists lock (cr_power2_aligned) * - cr_goal_len_fast lists lock (cr_goal_len_fast) * * Paths: * - new pa * object * group * * - find and use pa: * pa * * - release consumed pa: * pa * group * object * * - generate in-core bitmap: * group * pa * * - discard all for given object (inode, locality group): * object * pa * group * * - discard all for given group: * group * pa * group * object * * - allocation path (ext4_mb_regular_allocator) * group * cr_power2_aligned/cr_goal_len_fast */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for * each unique s_blocksize_bits */ #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" }; static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks); /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block * allocation in ext4_mb_new_blocks(). * 2. We increment this percpu discard_pa_seq counter when we either allocate * or free these blocks i.e. while marking those blocks as used/free in * mb_mark_used()/mb_free_blocks(). * 3. We also increment this percpu seq counter when we successfully identify * that the bb_prealloc_list is not empty and hence proceed for discarding * of those PAs inside ext4_mb_discard_group_preallocations(). * * Now to make sure that the regular fast path of block allocation is not * affected, as a small optimization we only sample the percpu seq counter * on that cpu. Only when the block allocation fails and when freed blocks * found were 0, that is when we sample percpu seq counter for all cpus using * below function ext4_get_discard_pa_seq_sum(). This happens after making * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. */ static DEFINE_PER_CPU(u64, discard_pa_seq); static inline u64 ext4_get_discard_pa_seq_sum(void) { int __cpu; u64 __seq = 0; for_each_possible_cpu(__cpu) __seq += per_cpu(discard_pa_seq, __cpu); return __seq; } static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif return addr; } static inline int mb_test_bit(int bit, void *addr) { /* * ext4_test_bit on architecture like powerpc * needs unsigned long aligned address */ addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_bit(bit, addr); } static inline void mb_set_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit(bit, addr); } static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } static inline int mb_test_and_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_and_clear_bit(bit, addr); } static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { *max = 0; return NULL; } /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); return e4b->bd_bitmap; } bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; } #ifdef DOUBLE_CHECK static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int i; struct super_block *sb = e4b->bd_sb; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { int i; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; b1 = (unsigned char *) e4b->bd_info->bb_bitmap; b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { ext4_msg(e4b->bd_sb, KERN_ERR, "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc", e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } } } static void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { struct buffer_head *bh; grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); if (!grp->bb_bitmap) return; bh = ext4_read_block_bitmap(sb, group); if (IS_ERR_OR_NULL(bh)) { kfree(grp->bb_bitmap); grp->bb_bitmap = NULL; return; } memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); } static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { kfree(grp->bb_bitmap); } #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { return; } static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { return; } #endif #ifdef AGGRESSIVE_CHECK #define MB_CHECK_ASSERT(assert) \ do { \ if (!(assert)) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: \"%s\"\n", \ function, file, line, # assert); \ BUG(); \ } \ } while (0) static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; int order = e4b->bd_blkbits + 1; int max; int max2; int i; int j; int k; int count; struct ext4_group_info *grp; int fragments = 0; int fstart; struct list_head *cur; void *buddy; void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); MB_CHECK_ASSERT(buddy); buddy2 = mb_find_buddy(e4b, order - 1, &max2); MB_CHECK_ASSERT(buddy2); MB_CHECK_ASSERT(buddy != buddy2); MB_CHECK_ASSERT(max * 2 == max2); count = 0; for (i = 0; i < max; i++) { if (mb_test_bit(i, buddy)) { /* only single bit in buddy2 may be 0 */ if (!mb_test_bit(i << 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit((i<<1)+1, buddy2)); } continue; } /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( !mb_test_bit(k, e4b->bd_bitmap)); } count++; } MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); order--; } fstart = -1; buddy = mb_find_buddy(e4b, 0, &max); for (i = 0; i < max; i++) { if (!mb_test_bit(i, buddy)) { MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); if (fstart == -1) { fragments++; fstart = i; } continue; } fstart = -1; /* check used bits only */ for (j = 0; j < e4b->bd_blkbits + 1; j++) { buddy2 = mb_find_buddy(e4b, j, &max2); k = i >> j; MB_CHECK_ASSERT(k < max2); MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); } } MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif /* * Divide blocks started from @first with length @len into * smaller chunks with power of 2 blocks. * Clear the bits in bitmap which the blocks of the chunk(s) covered, * then increase bb_counters[] for corresponded chunk size. */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; while (len > 0) { /* find how many blocks can be covered since this position */ max = ffs(first | border) - 1; /* find how many blocks of power 2 we need to mark */ min = fls(len) - 1; if (max < min) min = max; chunk = 1 << min; /* mark multiblock chunks only */ grp->bb_counters[min]++; if (min > 0) mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); len -= chunk; first += chunk; } } static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) { int order; /* * We don't bother with a special lists groups with only 1 block free * extents and for completely empty groups. */ order = fls(len) - 2; if (order < 0) return 0; if (order == MB_NUM_ORDERS(sb)) order--; if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) order = MB_NUM_ORDERS(sb) - 1; return order; } /* Move group to appropriate avg_fragment_size list */ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int new_order; if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0) return; new_order = mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); if (new_order == grp->bb_avg_fragment_size_order) return; if (grp->bb_avg_fragment_size_order != -1) { write_lock(&sbi->s_mb_avg_fragment_size_locks[ grp->bb_avg_fragment_size_order]); list_del(&grp->bb_avg_fragment_size_node); write_unlock(&sbi->s_mb_avg_fragment_size_locks[ grp->bb_avg_fragment_size_order]); } grp->bb_avg_fragment_size_order = new_order; write_lock(&sbi->s_mb_avg_fragment_size_locks[ grp->bb_avg_fragment_size_order]); list_add_tail(&grp->bb_avg_fragment_size_node, &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); write_unlock(&sbi->s_mb_avg_fragment_size_locks[ grp->bb_avg_fragment_size_order]); } /* * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. */ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter; int i; if (ac->ac_status == AC_STATUS_FOUND) return; if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { if (list_empty(&sbi->s_mb_largest_free_orders[i])) continue; read_lock(&sbi->s_mb_largest_free_orders_locks[i]); if (list_empty(&sbi->s_mb_largest_free_orders[i])) { read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); continue; } list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { *group = iter->bb_group; ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); return; } } read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); } /* Increment cr and search again if no group is found */ *new_cr = CR_GOAL_LEN_FAST; } /* * Find a suitable group of given order from the average fragments list. */ static struct ext4_group_info * ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order]; rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order]; struct ext4_group_info *grp = NULL, *iter; enum criteria cr = ac->ac_criteria; if (list_empty(frag_list)) return NULL; read_lock(frag_list_lock); if (list_empty(frag_list)) { read_unlock(frag_list_lock); return NULL; } list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) { grp = iter; break; } } read_unlock(frag_list_lock); return grp; } /* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; int i; if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) { if (sbi->s_mb_stats) atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions); } for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); i < MB_NUM_ORDERS(ac->ac_sb); i++) { grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); if (grp) { *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; return; } } /* * CR_BEST_AVAIL_LEN works based on the concept that we have * a larger normalized goal len request which can be trimmed to * a smaller goal len such that it can still satisfy original * request len. However, allocation request for non-regular * files never gets normalized. * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). */ if (ac->ac_flags & EXT4_MB_HINT_DATA) *new_cr = CR_BEST_AVAIL_LEN; else *new_cr = CR_GOAL_LEN_SLOW; } /* * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment * order we have and proactively trim the goal request length to that order to * find a suitable group faster. * * This optimizes allocation speed at the cost of slightly reduced * preallocations. However, we make sure that we don't trim the request too * much and fall to CR_GOAL_LEN_SLOW in that case. */ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; int i, order, min_order; unsigned long num_stripe_clusters = 0; if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) { if (sbi->s_mb_stats) atomic_inc(&sbi->s_bal_best_avail_bad_suggestions); } /* * mb_avg_fragment_size_order() returns order in a way that makes * retrieving back the length using (1 << order) inaccurate. Hence, use * fls() instead since we need to know the actual length while modifying * goal length. */ order = fls(ac->ac_g_ex.fe_len) - 1; if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; if (sbi->s_stripe > 0) { /* * We are assuming that stripe size is always a multiple of * cluster ratio otherwise __ext4_fill_super exists early. */ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); if (1 << min_order < num_stripe_clusters) /* * We consider 1 order less because later we round * up the goal len to num_stripe_clusters */ min_order = fls(num_stripe_clusters) - 1; } if (1 << min_order < ac->ac_o_ex.fe_len) min_order = fls(ac->ac_o_ex.fe_len); for (i = order; i >= min_order; i--) { int frag_order; /* * Scale down goal len to make sure we find something * in the free fragments list. Basically, reduce * preallocations. */ ac->ac_g_ex.fe_len = 1 << i; if (num_stripe_clusters > 0) { /* * Try to round up the adjusted goal length to * stripe size (in cluster units) multiple for * efficiency. */ ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, num_stripe_clusters); } frag_order = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); if (grp) { *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; return; } } /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; *new_cr = CR_GOAL_LEN_SLOW; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; return 1; } /* * Return next linear group for allocation. */ static ext4_group_t next_linear_group(ext4_group_t group, ext4_group_t ngroups) { /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. */ return group + 1 >= ngroups ? 0 : group + 1; } /* * ext4_mb_choose_next_group: choose next group for allocation. * * @ac Allocation Context * @new_cr This is an output parameter. If the there is no good group * available at current CR level, this field is updated to indicate * the new cr level that should be used. * @group This is an input / output parameter. As an input it indicates the * next group that the allocator intends to use for allocation. As * output, this field indicates the next group that should be used as * determined by the optimization functions. * @ngroups Total number of groups */ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { *new_cr = ac->ac_criteria; if (!should_optimize_scan(ac)) { *group = next_linear_group(*group, ngroups); return; } /* * Optimized scanning can return non adjacent groups which can cause * seek overhead for rotational disks. So try few linear groups before * trying optimized scan. */ if (ac->ac_groups_linear_remaining) { *group = next_linear_group(*group, ngroups); ac->ac_groups_linear_remaining--; return; } if (*new_cr == CR_POWER2_ALIGNED) { ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group); } else if (*new_cr == CR_GOAL_LEN_FAST) { ext4_mb_choose_next_group_goal_fast(ac, new_cr, group); } else if (*new_cr == CR_BEST_AVAIL_LEN) { ext4_mb_choose_next_group_best_avail(ac, new_cr, group); } else { /* * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an * rb tree sorted by bb_free. But until that happens, we should * never come here. */ WARN_ON(1); } } /* * Cache the order of the largest free extent we have available in this block * group. */ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int i; for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) if (grp->bb_counters[i] > 0) break; /* No need to move between order lists? */ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || i == grp->bb_largest_free_order) { grp->bb_largest_free_order = i; return; } if (grp->bb_largest_free_order >= 0) { write_lock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); list_del_init(&grp->bb_largest_free_order_node); write_unlock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); } grp->bb_largest_free_order = i; if (grp->bb_largest_free_order >= 0 && grp->bb_free) { write_lock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); list_add_tail(&grp->bb_largest_free_order_node, &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); write_unlock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); } } static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); /* initialize buddy from bitmap which is aggregation * of on-disk bitmap and preallocations */ i = mb_find_next_zero_bit(bitmap, max, 0); grp->bb_first_free = i; while (i < max) { fragments++; first = i; i = mb_find_next_bit(bitmap, max, i); len = i - first; free += len; if (len > 1) ext4_mb_mark_free_simple(sb, buddy, first, len, grp); else grp->bb_counters[0]++; if (i < max) i = mb_find_next_zero_bit(bitmap, max, i); } grp->bb_fragments = fragments; if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, "block bitmap and bg descriptor " "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); mb_update_avg_fragment_size(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; atomic_inc(&sbi->s_mb_buddies_generated); atomic64_add(period, &sbi->s_mb_generation_time); } static void mb_regenerate_buddy(struct ext4_buddy *e4b) { int count; int order = 1; void *buddy; while ((buddy = mb_find_buddy(e4b, order++, &count))) mb_set_bits(buddy, 0, count); e4b->bd_info->bb_fragments = 0; memset(e4b->bd_info->bb_counters, 0, sizeof(*e4b->bd_info->bb_counters) * (e4b->bd_sb->s_blocksize_bits + 2)); ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); } /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are * stored in the inode as * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. * So for each group we take up 2 blocks. A page can * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 * * Locking note: This routine takes the block group lock of all groups * for this page; do not hold this lock when calling this routine! */ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; int blocks_per_page; int groups_per_page; int err = 0; int i; ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; struct ext4_group_info *grinfo; inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; mb_debug(sb, "init folio %lu\n", folio->index); groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) groups_per_page = 1; /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, gfp); if (bh == NULL) return -ENOMEM; } else bh = &bhs; first_group = folio->index * blocks_per_page / 2; /* read all groups the folio covers into the cache */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { if (group >= ngroups) break; grinfo = ext4_get_group_info(sb, group); if (!grinfo) continue; /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. */ if (folio_test_uptodate(folio) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); if (IS_ERR(bh[i])) { err = PTR_ERR(bh[i]); bh[i] = NULL; goto out; } mb_debug(sb, "read bitmap for group %u\n", group); } /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { int err2; if (!bh[i]) continue; err2 = ext4_wait_block_bitmap(sb, group, bh[i]); if (!err) err = err2; } first_block = folio->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { group = (first_block + i) >> 1; if (group >= ngroups) break; if (!bh[group - first_group]) /* skip initialized uptodate buddy */ continue; if (!buffer_verified(bh[group - first_group])) /* Skip faulty bitmaps */ continue; err = 0; /* * data carry information regarding this * particular group in the format specified * above * */ data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* * We place the buddy block and bitmap block * close together */ grinfo = ext4_get_group_info(sb, group); if (!grinfo) { err = -EFSCORRUPTED; goto out; } if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * (MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); /* init the buddy */ memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ BUG_ON(incore != NULL); mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); memcpy(data, bitmap, blocksize); /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be * generated using this */ incore = data; } } folio_mark_uptodate(folio); out: if (bh) { for (i = 0; i < groups_per_page; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); } return err; } /* * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; struct folio *folio; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ return 0; } /* blocks_per_page == 1, hence we need another page for the buddy */ folio = __filemap_get_folio(inode->i_mapping, block + 1, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); e4b->bd_buddy_folio = folio; return 0; } static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) { folio_unlock(e4b->bd_bitmap_folio); folio_put(e4b->bd_bitmap_folio); } if (e4b->bd_buddy_folio) { folio_unlock(e4b->bd_buddy_folio); folio_put(e4b->bd_buddy_folio); } } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; struct ext4_buddy e4b; struct folio *folio; int ret = 0; might_sleep(); mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); if (!this_grp) return -EFSCORRUPTED; /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy page to page cache. * The call to ext4_mb_get_buddy_page_lock will mark the * page accessed. */ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ goto err; } folio = e4b.bd_bitmap_folio; ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy */ ret = 0; goto err; } /* init buddy cache */ folio = e4b.bd_buddy_folio; ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } err: ext4_mb_put_buddy_page_lock(&e4b); return ret; } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { int blocks_per_page; int block; int pnum; int poff; struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; might_sleep(); mb_debug(sb, "load group %u\n", group); blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); if (!grp) return -EFSCORRUPTED; e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* * we need full data about the group * to make a good selection */ ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; /* Avoid locking the folio in the fast path ... */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) /* * drop the folio reference and try * to get the folio with lock. If we * are not uptodate that implies * somebody just created the folio but * is yet to initialize it. So * wait for it to initialize. */ folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { folio_unlock(folio); goto err; } mb_cmp_bitmaps(e4b, folio_address(folio) + (poff * sb->s_blocksize)); } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } /* Folios marked accessed already */ e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { folio_unlock(folio); goto err; } } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } /* Folios marked accessed already */ e4b->bd_buddy_folio = folio; e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); return 0; err: if (!IS_ERR_OR_NULL(folio)) folio_put(folio); if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; return ret; } static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); } static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); if (e4b->bd_buddy_folio) folio_put(e4b->bd_buddy_folio); } static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1, max; void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); while (order <= e4b->bd_blkbits + 1) { bb = mb_find_buddy(e4b, order, &max); if (!mb_test_bit(block >> order, bb)) { /* this block is part of buddy of order 'order' */ return order; } order++; } return 0; } static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); *addr = 0; cur += 32; continue; } mb_clear_bit(cur, bm); cur++; } } /* clear bits in given range * will return first found zero bit if any, -1 otherwise */ static int mb_test_and_clear_bits(void *bm, int cur, int len) { __u32 *addr; int zero_bit = -1; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); if (*addr != (__u32)(-1) && zero_bit == -1) zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); *addr = 0; cur += 32; continue; } if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) zero_bit = cur; cur++; } return zero_bit; } void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: set whole word at once */ addr = bm + (cur >> 3); *addr = 0xffffffff; cur += 32; continue; } mb_set_bit(cur, bm); cur++; } } static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { if (mb_test_bit(*bit + side, bitmap)) { mb_clear_bit(*bit, bitmap); (*bit) -= side; return 1; } else { (*bit) += side; mb_set_bit(*bit, bitmap); return -1; } } static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) { int max; int order = 1; void *buddy = mb_find_buddy(e4b, order, &max); while (buddy) { void *buddy2; /* Bits in range [first; last] are known to be set since * corresponding blocks were allocated. Bits in range * (first; last) will stay set because they form buddies on * upper layer. We just deal with borders if they don't * align with upper layer and then go up. * Releasing entire group is all about clearing * single bit of highest order buddy. */ /* Example: * --------------------------------- * | 1 | 1 | 1 | 1 | * --------------------------------- * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | * --------------------------------- * 0 1 2 3 4 5 6 7 * \_____________________/ * * Neither [1] nor [6] is aligned to above layer. * Left neighbour [0] is free, so mark it busy, * decrease bb_counters and extend range to * [0; 6] * Right neighbour [7] is busy. It can't be coaleasced with [6], so * mark [6] free, increase bb_counters and shrink range to * [0; 5]. * Then shift range to [0; 2], go up and do the same. */ if (first & 1) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); if (!(last & 1)) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); if (first > last) break; order++; buddy2 = mb_find_buddy(e4b, order, &max); if (!buddy2) { mb_clear_bits(buddy, first, last - first + 1); e4b->bd_info->bb_counters[order - 1] += last - first + 1; break; } first >>= 1; last >>= 1; buddy = buddy2; } } static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int left_is_free = 0; int right_is_free = 0; int block; int last = first + count - 1; struct super_block *sb = e4b->bd_sb; if (WARN_ON(count == 0)) return; BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return; mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ if (first != 0) left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; /* * Fastcommit replay can free already freed blocks which * corrupts allocation info. Regenerate it. */ if (sbi->s_mount_state & EXT4_FC_REPLAY) { mb_regenerate_buddy(e4b); goto check; } blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing already freed block (bit %u); block bitmap corrupt.", block); return; } this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; /* buddy[0] == bd_bitmap is a special case, so handle * it right away and let mb_buddy_mark_free stay free of * zero order checks. * Check if neighbours are to be coaleasced, * adjust bitmap bb_counters and borders appropriately. */ if (first & 1) { first += !left_is_free; e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; } if (!(last & 1)) { last -= !right_is_free; e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; } if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); check: mb_check_buddy(e4b); } static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int max, order, next; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; return 0; } /* find actual order */ order = mb_find_order_for_block(e4b, block); ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); ex->fe_start = block; ex->fe_group = e4b->bd_group; block = block >> order; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); block = next >> order; ex->fe_len += 1 << order; } if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */ WARN_ON(1); ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, "corruption or bug in mb_find_extent " "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", block, order, needed, ex->fe_group, ex->fe_start, ex->fe_len, ex->fe_logical); ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; } return ex->fe_len; } static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) { int ord; int mlen = 0; int max = 0; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; /* let's maintain fragments counter */ if (start != 0) mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) e4b->bd_info->bb_fragments--; /* let's maintain buddy itself */ while (len) { ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! */ mlen = 1 << ord; buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; start += mlen; len -= mlen; BUG_ON(len < 0); continue; } /* store for history */ if (ret == 0) ret = len | (ord << 16); BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; ord_start = (start >> ord) << ord; ord_end = ord_start + (1 << ord); /* first chunk */ if (start > ord_start) ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, ord_start, start - ord_start, e4b->bd_info); /* last chunk */ if (start + len < ord_end) { ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, start + len, ord_end - (start + len), e4b->bd_info); break; } len = start + len - ord_end; start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; } /* * Must be called under group lock! */ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int ret; BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); BUG_ON(ac->ac_status == AC_STATUS_FOUND); ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; ret = mb_mark_used(e4b, &ac->ac_b_ex); /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ ac->ac_f_ex = ac->ac_b_ex; ac->ac_status = AC_STATUS_FOUND; ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; /* * take the page reference. We want the page to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ ac->ac_bitmap_folio = e4b->bd_bitmap_folio; folio_get(ac->ac_bitmap_folio); ac->ac_buddy_folio = e4b->bd_buddy_folio; folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); sbi->s_mb_last_group = ac->ac_f_ex.fe_group; sbi->s_mb_last_start = ac->ac_f_ex.fe_start; spin_unlock(&sbi->s_md_lock); } /* * As we've just preallocated more space than * user requested originally, we store allocated * space in a special descriptor. */ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) ext4_mb_new_preallocation(ac); } static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_buddy *e4b, int finish_group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; if (ac->ac_status == AC_STATUS_FOUND) return; /* * We don't want to scan for a whole year */ if (ac->ac_found > sbi->s_mb_max_to_scan && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { ac->ac_status = AC_STATUS_BREAK; return; } /* * Haven't found good chunk so far, let's continue */ if (bex->fe_len < gex->fe_len) return; if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan) ext4_mb_use_best_found(ac, e4b); } /* * The routine checks whether found extent is good enough. If it is, * then the extent gets marked used and flag is set to the context * to stop scanning. Otherwise, the extent is compared with the * previous found extent and if new one is better, then it's stored * in the context. Later, the best found extent will be used, if * mballoc can't find good enough extent. * * The algorithm used is roughly as follows: * * * If free extent found is exactly as big as goal, then * stop the scan and use it immediately * * * If free extent found is smaller than goal, then keep retrying * upto a max of sbi->s_mb_max_to_scan times (default 200). After * that stop scanning and use whatever we have. * * * If free extent found is bigger than goal, then keep retrying * upto a max of sbi->s_mb_min_to_scan times (default 10) before * stopping the scan and using the extent. * * * FIXME: real allocation policy is to be designed yet! */ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *ex, struct ext4_buddy *e4b) { struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; /* * The special case - take what you catch first */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * Let's check whether the chuck is good enough */ if (ex->fe_len == gex->fe_len) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * If this is first found extent, just store it in the context */ if (bex->fe_len == 0) { *bex = *ex; return; } /* * If new found extent is better, store it in the context */ if (bex->fe_len < gex->fe_len) { /* if the request isn't satisfied, any found extent * larger than previous best one is better */ if (ex->fe_len > bex->fe_len) *bex = *ex; } else if (ex->fe_len > gex->fe_len) { /* if the request is satisfied, then we try to find * an extent that still satisfy the request, but is * smaller than previous one */ if (ex->fe_len < bex->fe_len) *bex = *ex; } ext4_mb_check_limits(ac, e4b, 0); } static noinline_for_stack void ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; ext4_group_t group = ex.fe_group; int max; int err; BUG_ON(ex.fe_len <= 0); err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } static noinline_for_stack int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!grp) return -EFSCORRUPTED; if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) return 0; if (grp->bb_free == 0) return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } } else if (max >= ac->ac_g_ex.fe_len) { BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { /* Sometimes, caller may want to merge even small * number of blocks to an existing extent */ BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); return 0; } /* * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ static noinline_for_stack void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_group_info *grp = e4b->bd_info; void *buddy; int i; int k; int max; BUG_ON(ac->ac_2order <= 0); for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { if (grp->bb_counters[i] == 0) continue; buddy = mb_find_buddy(e4b, i, &max); if (WARN_RATELIMIT(buddy == NULL, "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) continue; k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { ext4_mark_group_bitmap_corrupted(ac->ac_sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); break; } ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; ac->ac_b_ex.fe_group = e4b->bd_group; ext4_mb_use_best_found(ac, e4b); BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); break; } } /* * The routine scans the group and measures all found extents. * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ static noinline_for_stack void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i, j, freelen; int free; free = e4b->bd_info->bb_free; if (WARN_ON(free <= 0)) return; i = e4b->bd_info->bb_first_free; while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * have free blocks */ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); break; } if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough * continuous free extent, so skip over the smaller free * extents */ j = mb_find_next_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); freelen = j - i; if (freelen < ac->ac_g_ex.fe_len) { i = j; free -= freelen; continue; } } mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit * without claiming the space. */ break; } ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; free -= ex.fe_len; } ext4_mb_check_limits(ac, e4b, 1); } /* * This is a special case for storages like raid5 * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; ext4_grpblk_t i, stripe; int max; BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, i, stripe, &ex); if (max >= stripe) { ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; } } i += stripe; } } /* * This is also called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable * for the allocation or not. */ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; if (free == 0) return false; fragments = grp->bb_fragments; if (fragments == 0) return false; switch (cr) { case CR_POWER2_ALIGNED: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return false; if (free < ac->ac_g_ex.fe_len) return false; if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) return true; if (grp->bb_largest_free_order < ac->ac_2order) return false; return true; case CR_GOAL_LEN_FAST: case CR_BEST_AVAIL_LEN: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; case CR_GOAL_LEN_SLOW: if (free >= ac->ac_g_ex.fe_len) return true; break; case CR_ANY_FREE: return true; default: BUG(); } return false; } /* * This could return negative error code if something goes wrong * during ext4_mb_init_group(). This should not be called with * ext4_lock_group() held. * * Note: because we are conditionally operating with the group lock in * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this * function using __acquire and __release. This means we need to be * super careful before messing with the error path handling via "goto * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; ext4_grpblk_t free; int ret = 0; if (!grp) return -EFSCORRUPTED; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } free = grp->bb_free; if (free == 0) goto out; /* * In all criterias except CR_ANY_FREE we try to avoid groups that * can't possibly satisfy the full goal request due to insufficient * free blocks. */ if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; /* * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, since this * gets used for metadata block allocation, and we want to make * sure we locate metadata blocks in the first block group in * the flex_bg if possible. */ if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) return 0; ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) return ret; } if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } ret = ext4_mb_good_group(ac, group, cr); out: if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } return ret; } /* * Start prefetching @nr block bitmaps starting at @group. * Return the next group which needs to be prefetched. */ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct buffer_head *bh; struct blk_plug plug; blk_start_plug(&plug); while (nr-- > 0) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); struct ext4_group_info *grp = ext4_get_group_info(sb, group); /* * Prefetch block groups with free blocks; but don't * bother if it is marked uninitialized on disk, since * it won't require I/O to read. Also only try to * prefetch once, so we avoid getblk() call, which can * be expensive. */ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 ) { bh = ext4_read_block_bitmap_nowait(sb, group, true); if (bh && !IS_ERR(bh)) { if (!buffer_uptodate(bh) && cnt) (*cnt)++; brelse(bh); } } if (++group >= ngroups) group = 0; } blk_finish_plug(&plug); return group; } /* * Prefetching reads the block bitmap into the buffer cache; but we * need to make sure that the buddy bitmap in the page cache has been * initialized. Note that ext4_mb_init_group() will block if the I/O * is not yet completed, or indeed if it was not initiated by * ext4_mb_prefetch did not start the I/O. * * TODO: We should actually kick off the buddy bitmap setup in a work * queue when the buffer I/O is completed, so that we don't block * waiting for the block allocation bitmap read to finish when * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). */ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr) { struct ext4_group_desc *gdp; struct ext4_group_info *grp; while (nr-- > 0) { if (!group) group = ext4_get_groups_count(sb); group--; gdp = ext4_get_group_desc(sb, group, NULL); grp = ext4_get_group_info(sb, group); if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0) { if (ext4_mb_init_group(sb, group, GFP_NOFS)) break; } } } static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t prefetch_grp = 0, ngroups, group, i; enum criteria new_cr, cr = CR_GOAL_LEN_FAST; int err = 0, first_err = 0; unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; int lost; sb = ac->ac_sb; sbi = EXT4_SB(sb); ngroups = ext4_get_groups_count(sb); /* non-extent files are limited to low blocks/groups */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ngroups = sbi->s_blockfile_groups; BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ err = ext4_mb_find_by_goal(ac, &e4b); if (err || ac->ac_status == AC_STATUS_FOUND) goto out; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) goto out; /* * ac->ac_2order is set only if the fe_len is a power of 2 * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ spin_lock(&sbi->s_md_lock); ac->ac_g_ex.fe_group = sbi->s_mb_last_group; ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } /* * Let's just scan groups to find more-less suitable blocks We * start with CR_GOAL_LEN_FAST, unless it is power of 2 * aligned, in which case let's do that faster approach first. */ if (ac->ac_2order) cr = CR_POWER2_ALIGNED; repeat: for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; /* * searching for the right group start * from the goal value specified */ group = ac->ac_g_ex.fe_group; ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; nr = 0; for (i = 0, new_cr = cr; i < ngroups; i++, ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { int ret = 0; cond_resched(); if (new_cr != cr) { cr = new_cr; goto repeat; } /* * Batch reads of the block allocation bitmaps * to get multiple READs in flight; limit * prefetching at inexpensive CR, otherwise mballoc * can spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && (ext4_mb_cr_expensive(cr) || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { nr = 1 << sbi->s_log_groups_per_flex; nr -= group & (nr - 1); nr = min(nr, sbi->s_mb_prefetch); } prefetch_grp = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); } /* This now checks without needing the buddy page */ ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!first_err) first_err = ret; continue; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) goto out; ext4_lock_group(sb, group); /* * We need to check again after locking the * block group */ ret = ext4_mb_good_group(ac, group, cr); if (ret == 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); continue; } ac->ac_groups_scanned++; if (cr == CR_POWER2_ALIGNED) ext4_mb_simple_scan_group(ac, &e4b); else { bool is_stripe_aligned = (sbi->s_stripe >= sbi->s_cluster_ratio) && !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe)); if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && is_stripe_aligned) ext4_mb_scan_aligned(ac, &e4b); if (ac->ac_status == AC_STATUS_CONTINUE) ext4_mb_complex_scan_group(ac, &e4b); } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); if (ac->ac_status != AC_STATUS_CONTINUE) break; } /* Processed all groups and haven't found blocks */ if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN) /* Reset goal length to original goal length before * falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { /* * We've been searching too long. Let's try to allocate * the best chunk we've found so far */ ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { /* * Someone more lucky has already allocated it. * The only thing we can do is just take first * found block(s) */ lost = atomic_inc_return(&sbi->s_mb_lost_chunks); mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, lost); ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; cr = CR_ANY_FREE; goto repeat; } } if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, ac->ac_flags, cr, err); if (nr) ext4_mb_prefetch_fini(sb, prefetch_grp, nr); return err; } static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; ++*pos; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i, err; char nbuf[16]; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, EXT4_MAX_BLOCK_LOG_SIZE + 2); group--; if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); if (!grinfo) return 0; /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); return 0; } ext4_mb_unload_buddy(&e4b); } /* * We care only about free space counters in the group info and * these are safe to access even after the buddy has been unloaded */ memcpy(sg, grinfo, i); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, sg->bb_fragments, sg->bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? sg->bb_counters[i] : 0); seq_puts(seq, " ]"); if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) seq_puts(seq, " Block bitmap corrupted!"); seq_putc(seq, '\n'); return 0; } static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, .show = ext4_mb_seq_groups_show, }; int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct ext4_sb_info *sbi = EXT4_SB(sb); seq_puts(seq, "mballoc:\n"); if (!sbi->s_mb_stats) { seq_puts(seq, "\tmb stats collection turned off.\n"); seq_puts( seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); return 0; } seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); /* CR_POWER2_ALIGNED stats */ seq_puts(seq, "\tcr_p2_aligned_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions)); /* CR_GOAL_LEN_FAST stats */ seq_puts(seq, "\tcr_goal_fast_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_goal_fast_bad_suggestions)); /* CR_BEST_AVAIL_LEN stats */ seq_puts(seq, "\tcr_best_avail_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_best_avail_bad_suggestions)); /* CR_GOAL_LEN_SLOW stats */ seq_puts(seq, "\tcr_goal_slow_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW])); /* CR_ANY_FREE stats */ seq_puts(seq, "\tcr_any_free_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE])); /* Aggregates */ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); seq_printf(seq, "\tbuddies_generated: %u/%u\n", atomic_read(&sbi->s_mb_buddies_generated), ext4_get_groups_count(sb)); seq_printf(seq, "\tbuddies_time_used: %llu\n", atomic64_read(&sbi->s_mb_generation_time)); seq_printf(seq, "\tpreallocated: %u\n", atomic_read(&sbi->s_mb_preallocated)); seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded)); return 0; } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); } static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; ++*pos; if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); } static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; unsigned int count; position--; if (position >= MB_NUM_ORDERS(sb)) { position -= MB_NUM_ORDERS(sb); if (position == 0) seq_puts(seq, "avg_fragment_size_lists:\n"); count = 0; read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], bb_avg_fragment_size_node) count++; read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; } if (position == 0) { seq_printf(seq, "optimize_scan: %d\n", test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); seq_puts(seq, "max_free_order_lists:\n"); } count = 0; read_lock(&sbi->s_mb_largest_free_orders_locks[position]); list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], bb_largest_free_order_node) count++; read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_structs_summary_ops = { .start = ext4_mb_seq_structs_summary_start, .next = ext4_mb_seq_structs_summary_next, .stop = ext4_mb_seq_structs_summary_stop, .show = ext4_mb_seq_structs_summary_show, }; static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; BUG_ON(!cachep); return cachep; } /* * Allocate the top-level s_group_info array for the specified number * of groups */ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned size; struct ext4_group_info ***old_groupinfo, ***new_groupinfo; size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); if (size <= sbi->s_group_info_size) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } rcu_read_lock(); old_groupinfo = rcu_dereference(sbi->s_group_info); if (old_groupinfo) memcpy(new_groupinfo, old_groupinfo, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); rcu_read_unlock(); rcu_assign_pointer(sbi->s_group_info, new_groupinfo); sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); if (old_groupinfo) ext4_kvfree_array_rcu(old_groupinfo); ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { int i; int metalen = 0; int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); /* * First check if this group is the first of a reserved block. * If it's true, we have to allocate a new table of pointers * to ext4_group_info structures */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_NOFS); if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); return -ENOMEM; } rcu_read_lock(); rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; rcu_read_unlock(); } meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); /* * initialize bb_free to be able to skip * empty groups without initialization */ if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { meta_group_info[i]->bb_free = ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; exit_group_info: /* If a meta_group_info table has been allocated, release it now */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { struct ext4_group_info ***group_info; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); kfree(group_info[idx]); group_info[idx] = NULL; rcu_read_unlock(); } return -ENOMEM; } /* ext4_mb_add_groupinfo */ static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; struct ext4_group_desc *desc; struct ext4_group_info ***group_info; struct kmem_cache *cachep; err = ext4_mb_alloc_groupinfo(sb, ngroups); if (err) return err; sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } /* To avoid potentially colliding with an valid on-disk inode number, * use EXT4_BAD_INO for the buddy cache inode number. This inode is * not in the inode hash, so it should never be found by iget(), but * this will avoid confusion if it ever shows up during debugging. */ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { cond_resched(); desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) goto err_freebuddy; } if (ext4_has_feature_flex_bg(sb)) { /* a single flex group is supposed to be read by a single IO. * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is * unsigned integer, so the maximum shift is 32. */ if (sbi->s_es->s_log_groups_per_flex >= 32) { ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); goto err_freebuddy; } sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ } else { sbi->s_mb_prefetch = 32; } if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) sbi->s_mb_prefetch = ext4_get_groups_count(sb); /* * now many real IOs to prefetch within a single allocation at * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related * optimization we shouldn't try to load too many groups, at some point * we should start to use what we've got in memory. * with an average random access time 5ms, it'd take a second to get * 200 groups (* N with flex_bg), so let's make this limit 4 */ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); return 0; err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); if (grp) kmem_cache_free(cachep, grp); } i = sbi->s_group_info_size; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); while (i-- > 0) kfree(group_info[i]); rcu_read_unlock(); iput(sbi->s_buddy_cache); err_freesgi: rcu_read_lock(); kvfree(rcu_dereference(sbi->s_group_info)); rcu_read_unlock(); return -ENOMEM; } static void ext4_groupinfo_destroy_slabs(void) { int i; for (i = 0; i < NR_GRPINFO_CACHES; i++) { kmem_cache_destroy(ext4_groupinfo_caches[i]); ext4_groupinfo_caches[i] = NULL; } } static int ext4_groupinfo_create_slab(size_t size) { static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); int slab_size; int blocksize_bits = order_base_2(size); int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep; if (cache_index >= NR_GRPINFO_CACHES) return -EINVAL; if (unlikely(cache_index < 0)) cache_index = 0; mutex_lock(&ext4_grpinfo_slab_create_mutex); if (ext4_groupinfo_caches[cache_index]) { mutex_unlock(&ext4_grpinfo_slab_create_mutex); return 0; /* Already created */ } slab_size = offsetof(struct ext4_group_info, bb_counters[blocksize_bits + 2]); cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); ext4_groupinfo_caches[cache_index] = cachep; mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { printk(KERN_EMERG "EXT4-fs: no memory for groupinfo slab cache\n"); return -ENOMEM; } return 0; } static void ext4_discard_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_discard_work); struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); load_grp = UINT_MAX; list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { /* * If filesystem is umounting or no memory or suffering * from no space, give up the discard */ if ((sb->s_flags & SB_ACTIVE) && !err && !atomic_read(&sbi->s_retry_alloc_pending)) { grp = fd->efd_group; if (grp != load_grp) { if (load_grp != UINT_MAX) ext4_mb_unload_buddy(&e4b); err = ext4_mb_load_buddy(sb, grp, &e4b); if (err) { kmem_cache_free(ext4_free_data_cachep, fd); load_grp = UINT_MAX; continue; } else { load_grp = grp; } } ext4_lock_group(sb, grp); ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, fd->efd_start_cluster + fd->efd_count - 1, 1); ext4_unlock_group(sb, grp); } kmem_cache_free(ext4_free_data_cachep, fd); } if (load_grp != UINT_MAX) ext4_mb_unload_buddy(&e4b); } int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; unsigned offset, offset_incr; unsigned max; int ret; i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { ret = -ENOMEM; goto out; } i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { ret = -ENOMEM; goto out; } ret = ext4_groupinfo_create_slab(sb->s_blocksize); if (ret < 0) goto out; /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; sbi->s_mb_offsets[0] = 0; i = 1; offset = 0; offset_incr = 1 << (sb->s_blocksize_bits - 1); max = sb->s_blocksize << 2; do { sbi->s_mb_offsets[i] = offset; sbi->s_mb_maxs[i] = max; offset += offset_incr; offset_incr = offset_incr >> 1; max = max >> 1; i++; } while (i < MB_NUM_ORDERS(sb)); sbi->s_mb_avg_fragment_size = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), GFP_KERNEL); if (!sbi->s_mb_avg_fragment_size) { ret = -ENOMEM; goto out; } sbi->s_mb_avg_fragment_size_locks = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), GFP_KERNEL); if (!sbi->s_mb_avg_fragment_size_locks) { ret = -ENOMEM; goto out; } for (i = 0; i < MB_NUM_ORDERS(sb); i++) { INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); } sbi->s_mb_largest_free_orders = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), GFP_KERNEL); if (!sbi->s_mb_largest_free_orders) { ret = -ENOMEM; goto out; } sbi->s_mb_largest_free_orders_locks = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), GFP_KERNEL); if (!sbi->s_mb_largest_free_orders_locks) { ret = -ENOMEM; goto out; } for (i = 0; i < MB_NUM_ORDERS(sb); i++) { INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); } spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); INIT_WORK(&sbi->s_discard_work, ext4_discard_work); atomic_set(&sbi->s_retry_alloc_pending, 0); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file * systems, this is probably too big (i.e, if the cluster size * is 1 megabyte, then group preallocation size becomes half a * gigabyte!). As a default, we will keep a two megabyte * group pralloc size for cluster sizes up to 64k, and after * that, we will force a minimum group preallocation size of * 32 clusters. This translates to 8 megs when the cluster * size is 256k, and 32 megs when the cluster size is 1 meg, * which seems reasonable as a default. */ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> sbi->s_cluster_bits, 32); /* * If there is a s_stripe > 1, then we set the s_mb_group_prealloc * to the lowest multiple of s_stripe which is bigger than * the s_mb_group_prealloc as determined above. We want * the preallocation size to be an exact multiple of the * RAID stripe size so that preallocations don't fragment * the stripes. */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; goto out; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; lg = per_cpu_ptr(sbi->s_locality_groups, i); mutex_init(&lg->lg_mutex); for (j = 0; j < PREALLOC_TB_SIZE; j++) INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); spin_lock_init(&lg->lg_prealloc_lock); } if (bdev_nonrot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) goto out_free_locality_groups; return 0; out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out: kfree(sbi->s_mb_avg_fragment_size); kfree(sbi->s_mb_avg_fragment_size_locks); kfree(sbi->s_mb_largest_free_orders); kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); sbi->s_mb_maxs = NULL; return ret; } /* need to called with the ext4 group lock held */ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; struct list_head *cur, *tmp; int count = 0; list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; kmem_cache_free(ext4_pspace_cachep, pa); } return count; } void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo, ***group_info; struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); int count; if (test_opt(sb, DISCARD)) { /* * wait the discard work to drain all of ext4_free_data */ flush_work(&sbi->s_discard_work); WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); } if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); if (!grinfo) continue; mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); count = ext4_mb_cleanup_pa(grinfo); if (count) mb_debug(sb, "mballoc: %d PAs left\n", count); ext4_unlock_group(sb, i); kmem_cache_free(cachep, grinfo); } num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); for (i = 0; i < num_meta_group_infos; i++) kfree(group_info[i]); kvfree(group_info); rcu_read_unlock(); } kfree(sbi->s_mb_avg_fragment_size); kfree(sbi->s_mb_avg_fragment_size_locks); kfree(sbi->s_mb_largest_free_orders); kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { ext4_msg(sb, KERN_INFO, "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); ext4_msg(sb, KERN_INFO, "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_groups_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); ext4_msg(sb, KERN_INFO, "mballoc: %u generated and it took %llu", atomic_read(&sbi->s_mb_buddies_generated), atomic64_read(&sbi->s_mb_generation_time)); ext4_msg(sb, KERN_INFO, "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } free_percpu(sbi->s_locality_groups); } static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + ext4_group_first_block_no(sb, block_group)); count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, struct ext4_free_data *entry) { struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0; mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); spin_lock(&EXT4_SB(sb)->s_md_lock); EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; spin_unlock(&EXT4_SB(sb)->s_md_lock); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; ext4_lock_group(sb, entry->efd_group); /* Take it out of per group rb tree */ rb_erase(&entry->efd_node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); /* * Clear the trimmed flag for the group so that the next * ext4_trim_fs can trim it. */ EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ folio_put(e4b.bd_buddy_folio); folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); mb_debug(sb, "freed %d blocks in 1 structures\n", count); } /* * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. */ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; LIST_HEAD(freed_data_list); struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; bool wake; list_replace_init(s_freed_head, &freed_data_list); list_for_each_entry(entry, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); if (test_opt(sb, DISCARD)) { spin_lock(&sbi->s_md_lock); wake = list_empty(&sbi->s_discard_list); list_splice_tail(&freed_data_list, &sbi->s_discard_list); spin_unlock(&sbi->s_md_lock); if (wake) queue_work(system_unbound_wq, &sbi->s_discard_work); } else { list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) kmem_cache_free(ext4_free_data_cachep, entry); } } int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) goto out; ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) goto out_pa_free; ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT); if (ext4_free_data_cachep == NULL) goto out_ac_free; return 0; out_ac_free: kmem_cache_destroy(ext4_ac_cachep); out_pa_free: kmem_cache_destroy(ext4_pspace_cachep); out: return -ENOMEM; } void ext4_exit_mballoc(void) { /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. */ rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); } #define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 #define EXT4_MB_SYNC_UPDATE 0x0002 static int ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, ext4_group_t group, ext4_grpblk_t blkoff, ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; int err; unsigned int i, already, changed = len; KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, handle, sb, state, group, blkoff, len, flags, ret_changed); if (ret_changed) *ret_changed = 0; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) return PTR_ERR(bitmap_bh); if (handle) { BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (err) goto out_err; } err = -EIO; gdp = ext4_get_group_desc(sb, group, &gdp_bh); if (!gdp) goto out_err; if (handle) { BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); if (err) goto out_err; } ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); } if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { already = 0; for (i = 0; i < len; i++) if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == state) already++; changed = len - already; } if (state) { mb_set_bits(bitmap_bh->b_data, blkoff, len); ext4_free_group_clusters_set(sb, gdp, ext4_free_group_clusters(sb, gdp) - changed); } else { mb_clear_bits(bitmap_bh->b_data, blkoff, len); ext4_free_group_clusters_set(sb, gdp, ext4_free_group_clusters(sb, gdp) + changed); } ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); if (ret_changed) *ret_changed = changed; if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, group); struct flex_groups *fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); if (state) atomic64_sub(changed, &fg->free_clusters); else atomic64_add(changed, &fg->free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); if (err) goto out_err; if (flags & EXT4_MB_SYNC_UPDATE) { sync_dirty_buffer(bitmap_bh); sync_dirty_buffer(gdp_bh); } out_err: brelse(bitmap_bh); return err; } /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_clstrs) { struct ext4_group_desc *gdp; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; int flags = 0; ext4_grpblk_t changed; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); sb = ac->ac_sb; sbi = EXT4_SB(sb); gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); if (!gdp) return -EIO; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, 0, NULL); if (!err) err = -EFSCORRUPTED; return err; } #ifdef AGGRESSIVE_CHECK flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, flags, &changed); if (err && changed == 0) return err; #ifdef AGGRESSIVE_CHECK BUG_ON(changed != ac->ac_b_ex.fe_len); #endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); return err; } /* * Idempotent helper for Ext4 fast commit replay path to set the state of * blocks in bitmaps and update counters. */ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; int err = 0; unsigned int clen, thisgrp_len; while (len > 0) { ext4_get_group_no_and_offset(sb, block, &group, &blkoff); /* * Check to see if we are freeing blocks across a group * boundary. * In case of flex_bg, this can happen that (block, len) may * span across more than one group. In that case we need to * get the corresponding group metadata to work with. * For this we have goto again loop. */ thisgrp_len = min_t(unsigned int, (unsigned int)len, EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); clen = EXT4_NUM_B2C(sbi, thisgrp_len); if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { ext4_error(sb, "Marking blocks in system zone - " "Block = %llu, len = %u", block, thisgrp_len); break; } err = ext4_mb_mark_context(NULL, sb, state, group, blkoff, clen, EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE, NULL); if (err) break; block += thisgrp_len; len -= thisgrp_len; BUG_ON(len < 0); } } /* * here we normalize request for locality group * Group request are normalized to s_mb_group_prealloc, which goes to * s_strip if we set the same via mount option. * s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? */ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); } /* * This function returns the next element to look at during inode * PA rbtree walk. We assume that we have held the inode PA rbtree lock * (ei->i_prealloc_lock) * * new_start The start of the range we want to compare * cur_start The existing start that we are comparing against * node The node of the rb_tree */ static inline struct rb_node* ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) { if (new_start < cur_start) return node->rb_left; else return node->rb_right; } static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; ext4_lblk_t tmp_pa_start; loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); spin_unlock(&tmp_pa->pa_lock); } read_unlock(&ei->i_prealloc_lock); } /* * Given an allocation context "ac" and a range "start", "end", check * and adjust boundaries if the range overlaps with any of the existing * preallocatoins stored in the corresponding inode of the allocation context. * * Parameters: * ac allocation context * start start of the new range * end end of the new range */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = *start; new_end = *end; /* * Adjust the normalized range so that it doesn't overlap with any * existing preallocated blocks(PAs). Make sure to hold the rbtree lock * so it doesn't change underneath us. */ read_lock(&ei->i_prealloc_lock); /* Step 1: find any one immediate neighboring PA of the normalized range */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, tmp_pa_start, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || ac->ac_o_ex.fe_logical < tmp_pa_start)); spin_unlock(&tmp_pa->pa_lock); } /* * Step 2: check if the found PA is left or right neighbor and * get the other neighbor */ if (tmp_pa) { if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { struct rb_node *tmp; left_pa = tmp_pa; tmp = rb_next(&left_pa->pa_node.inode_node); if (tmp) { right_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } } else { struct rb_node *tmp; right_pa = tmp_pa; tmp = rb_prev(&right_pa->pa_node.inode_node); if (tmp) { left_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } } } /* Step 3: get the non deleted neighbors */ if (left_pa) { for (iter = &left_pa->pa_node.inode_node;; iter = rb_prev(iter)) { if (!iter) { left_pa = NULL; break; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); left_pa = tmp_pa; spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { spin_unlock(&tmp_pa->pa_lock); break; } spin_unlock(&tmp_pa->pa_lock); } } if (right_pa) { for (iter = &right_pa->pa_node.inode_node;; iter = rb_next(iter)) { if (!iter) { right_pa = NULL; break; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); right_pa = tmp_pa; spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { spin_unlock(&tmp_pa->pa_lock); break; } spin_unlock(&tmp_pa->pa_lock); } } if (left_pa) { left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } if (right_pa) { right_pa_start = right_pa->pa_lstart; BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); } /* Step 4: trim our normalized range to not overlap with the neighbors */ if (left_pa) { if (left_pa_end > new_start) new_start = left_pa_end; } if (right_pa) { if (right_pa_start < new_end) new_end = right_pa_start; } read_unlock(&ei->i_prealloc_lock); /* XXX: extra loop to check we really don't overlap preallocations */ ext4_mb_pa_assert_overlap(ac, new_start, new_end); *start = new_start; *end = new_end; } /* * Normalization means making request better in terms of * size and alignment */ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; /* do normalize only data requests, metadata requests do not need preallocation */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; /* sometime caller may want exact blocks */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; /* caller may indicate that preallocation isn't * required (it's a tail, for example) */ if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) return; if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { ext4_mb_normalize_group_request(ac); return ; } bsbits = ac->ac_sb->s_blocksize_bits; /* first, let's learn actual file size * given current request is allocated */ size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); orig_size = size; /* max size of free chunks */ max = 2 << bsbits; #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ /* XXX: should this table be tunable? */ start_off = 0; if (size <= 16 * 1024) { size = 16 * 1024; } else if (size <= 32 * 1024) { size = 32 * 1024; } else if (size <= 64 * 1024) { size = 64 * 1024; } else if (size <= 128 * 1024) { size = 128 * 1024; } else if (size <= 256 * 1024) { size = 256 * 1024; } else if (size <= 512 * 1024) { size = 512 * 1024; } else if (size <= 1024 * 1024) { size = 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (21 - bsbits)) << 21; size = 2 * 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; size = 8 * 1024 * 1024; } else { start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; size = (loff_t) EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; start = start_off >> bsbits; /* * For tiny groups (smaller than 8MB) the chosen allocation * alignment may be larger than group size. Make sure the * alignment does not move allocation to a different group which * makes mballoc fail assertions later. */ start = max(start, rounddown(ac->ac_o_ex.fe_logical, (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); /* avoid unnecessary preallocation that may trigger assertions */ if (start + size > EXT_MAX_BLOCKS) size = EXT_MAX_BLOCKS - start; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; start = ar->lleft + 1; } if (ar->pright && start + size - 1 >= ar->lright) size -= start + size - ar->lright; /* * Trim allocation request for filesystems with artificially small * groups. */ if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); end = start + size; ext4_mb_pa_adjust_overlap(ac, &start, &end); size = end - start; /* * In this function "start" and "size" are normalized for better * alignment and length such that we could preallocate more blocks. * This normalization is done such that original request of * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and * "size" boundaries. * (Note fe_len can be relaxed since FS block allocation API does not * provide gurantee on number of contiguous blocks allocation since that * depends upon free space left, etc). * In case of inode pa, later we use the allocated blocks * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated * range of goal/best blocks [start, size] to put it at the * ac_o_ex.fe_logical extent of this inode. * (See ext4_mb_use_inode_pa() for more details) */ if (start + size <= ac->ac_o_ex.fe_logical || start > ac->ac_o_ex.fe_logical) { ext4_msg(ac->ac_sb, KERN_ERR, "start %lu, size %lu, fe_logical %lu", (unsigned long) start, (unsigned long) size, (unsigned long) ac->ac_o_ex.fe_logical); BUG(); } BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size)) && ar->pright >= size && ar->pright - size >= le32_to_cpu(es->s_first_data_block)) { /* merge to the right */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, &ac->ac_g_ex.fe_group, &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } if (ar->pleft && (ar->lleft + 1 == start) && ar->pleft + 1 < ext4_blocks_count(es)) { /* merge to the left */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, &ac->ac_g_ex.fe_group, &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, orig_size, start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); for (int i=0; i<EXT4_MB_NUM_CRS; i++) { atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]); } atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); /* did we allocate as much as normalizer originally wanted? */ if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len) atomic_inc(&sbi->s_bal_len_goals); if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) trace_ext4_mballoc_alloc(ac); else trace_ext4_mballoc_prealloc(ac); } /* * Called on failure; free up any blocks from the inode PA for this * context. We don't need this for MB_GROUP_PA because we only change * pa_free in ext4_mb_release_context(), but on failure, we've already * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. */ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; struct ext4_buddy e4b; int err; if (pa == NULL) { if (ac->ac_f_ex.fe_len == 0) return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); if (WARN_RATELIMIT(err, "ext4: mb_load_buddy failed (%d)", err)) /* * This should never happen since we pin the * pages in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. */ return; ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); ext4_mb_unload_buddy(&e4b); return; } if (pa->pa_type == MB_INODE_PA) { spin_lock(&pa->pa_lock); pa->pa_free += ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); } } /* * use blocks preallocated to inode */ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); BUG_ON(ac->ac_b_ex.fe_len <= 0); pa->pa_free -= len; mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); } /* * use blocks preallocated to locality group */ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; /* we don't correct pa_pstart or pa_len here to avoid * possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", pa->pa_lstart, len, pa); } /* * Return the prealloc space that have minimal distance * from the goal block. @cpa is the prealloc * space that is having currently known minimal distance * from the goal block. */ static struct ext4_prealloc_space * ext4_mb_check_group_pa(ext4_fsblk_t goal_block, struct ext4_prealloc_space *pa, struct ext4_prealloc_space *cpa) { ext4_fsblk_t cur_distance, new_distance; if (cpa == NULL) { atomic_inc(&pa->pa_count); return pa; } cur_distance = abs(goal_block - cpa->pa_pstart); new_distance = abs(goal_block - pa->pa_pstart); if (cur_distance <= new_distance) return cpa; /* drop the previous reference */ atomic_dec(&cpa->pa_count); atomic_inc(&pa->pa_count); return pa; } /* * check if found pa meets EXT4_MB_HINT_GOAL_ONLY */ static bool ext4_mb_pa_goal_check(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))) return true; /* * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted * in ext4_mb_normalize_request and will keep same with ac_o_ex * from ext4_mb_initialize_context. Choose ac_g_ex here to keep * consistent with ext4_mb_find_by_goal. */ start = pa->pa_pstart + (ac->ac_g_ex.fe_logical - pa->pa_lstart); if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start) return false; if (ac->ac_g_ex.fe_len > pa->pa_len - EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart)) return false; return true; } /* * search goal blocks in preallocated space */ static noinline_for_stack bool ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; struct rb_node *iter; ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return false; /* * first, try per-file preallocation by searching the inode pa rbtree. * * Here, we can't do a direct traversal of the tree because * ext4_mb_discard_group_preallocation() can paralelly mark the pa * deleted and that can cause direct traversal to skip some entries. */ read_lock(&ei->i_prealloc_lock); if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) { goto try_group_pa; } /* * Step 1: Find a pa with logical start immediately adjacent to the * original logical start. This could be on the left or right. * * (tmp_pa->pa_lstart never changes so we can skip locking for it). */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, tmp_pa->pa_lstart, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); } /* * Step 2: The adjacent pa might be to the right of logical start, find * the left adjacent pa. After this step we'd have a valid tmp_pa whose * logical start is towards the left of original request's logical start */ if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) { struct rb_node *tmp; tmp = rb_prev(&tmp_pa->pa_node.inode_node); if (tmp) { tmp_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } else { /* * If there is no adjacent pa to the left then finding * an overlapping pa is not possible hence stop searching * inode pa tree */ goto try_group_pa; } } BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); /* * Step 3: If the left adjacent pa is deleted, keep moving left to find * the first non deleted adjacent pa. After this step we should have a * valid tmp_pa which is guaranteed to be non deleted. */ for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) { if (!iter) { /* * no non deleted left adjacent pa, so stop searching * inode pa tree */ goto try_group_pa; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { /* * We will keep holding the pa_lock from * this point on because we don't want group discard * to delete this pa underneath us. Since group * discard is anyways an ENOSPC operation it * should be okay for it to wait a few more cycles. */ break; } else { spin_unlock(&tmp_pa->pa_lock); } } BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); BUG_ON(tmp_pa->pa_deleted == 1); /* * Step 4: We now have the non deleted left adjacent pa. Only this * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't. */ if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > EXT4_MAX_BLOCK_FILE_PHYS)) { /* * Since PAs don't overlap, we won't find any other PA to * satisfy this. */ spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); read_unlock(&ei->i_prealloc_lock); return true; } else { /* * We found a valid overlapping pa but couldn't use it because * it had no free blocks. This should ideally never happen * because: * * 1. When a new inode pa is added to rbtree it must have * pa_free > 0 since otherwise we won't actually need * preallocation. * * 2. An inode pa that is in the rbtree can only have it's * pa_free become zero when another thread calls: * ext4_mb_new_blocks * ext4_mb_use_preallocated * ext4_mb_use_inode_pa * * 3. Further, after the above calls make pa_free == 0, we will * immediately remove it from the rbtree in: * ext4_mb_new_blocks * ext4_mb_release_context * ext4_mb_put_pa * * 4. Since the pa_free becoming 0 and pa_free getting removed * from tree both happen in ext4_mb_new_blocks, which is always * called with i_data_sem held for data allocations, we can be * sure that another process will never see a pa in rbtree with * pa_free == 0. */ WARN_ON_ONCE(tmp_pa->pa_free == 0); } spin_unlock(&tmp_pa->pa_lock); try_group_pa: read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) return false; /* inode may have no locality group for some reason */ lg = ac->ac_lg; if (lg == NULL) return false; order = fls(ac->ac_o_ex.fe_len) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], pa_node.lg_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { cpa = ext4_mb_check_group_pa(goal_block, tmp_pa, cpa); } spin_unlock(&tmp_pa->pa_lock); } rcu_read_unlock(); } if (cpa) { ext4_mb_use_group_pa(ac, cpa); return true; } return false; } /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock held */ static noinline_for_stack void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_prealloc_space *pa; struct list_head *cur; ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; int len; if (!grp) return; /* all form of preallocation discards first load group, * so the only competing code is preallocation use. * we don't need any locking here * notice we do NOT ignore preallocations with pa_deleted * otherwise we could leave used blocks available for * allocation in buddy when concurrent ext4_mb_put_pa() * is dropping preallocation */ list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &start); len = pa->pa_len; spin_unlock(&pa->pa_lock); if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); mb_set_bits(bitmap, start, len); preallocated += len; } mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } static void ext4_mb_mark_pa_deleted(struct super_block *sb, struct ext4_prealloc_space *pa) { struct ext4_inode_info *ei; if (pa->pa_deleted) { ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", pa->pa_type, pa->pa_pstart, pa->pa_lstart, pa->pa_len); return; } pa->pa_deleted = 1; if (pa->pa_type == MB_INODE_PA) { ei = EXT4_I(pa->pa_inode); atomic_dec(&ei->i_prealloc_active); } } static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) { BUG_ON(!pa); BUG_ON(atomic_read(&pa->pa_count)); BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); ext4_mb_pa_free(pa); } /* * drops a reference to preallocated space descriptor * if this was the last reference and the space is consumed */ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; ext4_fsblk_t grp_blk; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { spin_unlock(&pa->pa_lock); return; } if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; } ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; /* * If doing group-based preallocation, pa_pstart may be in the * next group when pa is used up */ if (pa->pa_type == MB_GROUP_PA) grp_blk--; grp = ext4_get_group_number(sb, grp_blk); /* * possible race: * * P1 (buddy init) P2 (regular allocation) * find block B in PA * copy on-disk bitmap to buddy * mark B in on-disk bitmap * drop PA from group * mark all PAs in buddy * * thus, P1 initializes buddy with B available. to prevent this * we make "copy" and "mark all PAs" atomic and serialize "drop PA" * against that pair */ ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); if (pa->pa_type == MB_INODE_PA) { write_lock(pa->pa_node_lock.inode_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); ext4_mb_pa_free(pa); } else { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) { struct rb_node **iter = &root->rb_node, *parent = NULL; struct ext4_prealloc_space *iter_pa, *new_pa; ext4_lblk_t iter_start, new_start; while (*iter) { iter_pa = rb_entry(*iter, struct ext4_prealloc_space, pa_node.inode_node); new_pa = rb_entry(new, struct ext4_prealloc_space, pa_node.inode_node); iter_start = iter_pa->pa_lstart; new_start = new_pa->pa_lstart; parent = *iter; if (new_start < iter_start) iter = &((*iter)->rb_left); else iter = &((*iter)->rb_right); } rb_link_node(new, parent, iter); rb_insert_color(new, root); } /* * creates new preallocated space for given inode */ static noinline_for_stack void ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ac->ac_pa == NULL); pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { struct ext4_free_extent ex = { .fe_logical = ac->ac_g_ex.fe_logical, .fe_len = ac->ac_orig_goal_len, }; loff_t orig_goal_end = extent_logical_end(sbi, &ex); loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex); /* * We can't allocate as much as normalizer wants, so we try * to get proper lstart to cover the original request, except * when the goal doesn't cover the original request as below: * * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048 * best_ex:0/200(200) -> adjusted: 1848/2048(200) */ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); /* * Use the below logic for adjusting best extent as it keeps * fragmentation in check while ensuring logical range of best * extent doesn't overflow out of goal extent: * * 1. Check if best ex can be kept at end of goal (before * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and * still cover original end * 3. Else, keep the best ex at start of original request. */ ex.fe_len = ac->ac_b_ex.fe_len; ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; ex.fe_logical = ac->ac_g_ex.fe_logical; if (o_ex_end <= extent_logical_end(sbi, &ex)) goto adjust_bex; ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ext4_mb_use_inode_pa(ac, pa); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); if (!grp) return; pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); write_lock(pa->pa_node_lock.inode_lock); ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); write_unlock(pa->pa_node_lock.inode_lock); atomic_inc(&ei->i_prealloc_active); } /* * creates new preallocated space for locality group inodes belongs to */ static noinline_for_stack void ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; struct ext4_prealloc_space *pa; struct ext4_group_info *grp; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ac->ac_pa == NULL); pa = ac->ac_pa; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_node.lg_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); if (!grp) return; lg = ac->ac_lg; BUG_ON(lg == NULL); pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); /* * We will later add the new pa to the right bucket * after updating the pa_free in ext4_mb_release_context */ } static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) { if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) ext4_mb_new_group_pa(ac); else ext4_mb_new_inode_pa(ac); } /* * finds all unused blocks in on-disk bitmap, frees them in * in-core bitmap and buddy. * @pa must be unlinked from inode and group lists, so that * nobody else can find/use it. * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int end; unsigned int next; ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); mb_debug(sb, "free preallocated %u/%u in group %u\n", (unsigned) ext4_group_first_block_no(sb, group) + bit, (unsigned) next - bit, (unsigned) group); free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } if (free != pa->pa_free) { ext4_msg(e4b->bd_sb, KERN_CRIT, "pa %p: logic %lu, phys. %lu, len %d", pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. */ } atomic_add(free, &sbi->s_mb_discarded); } static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); } /* * releases all preallocations in given group * * first, we need to decide discard policy: * - when do we discard * 1) ENOSPC * - how many do we discard * 1) how many requested */ static noinline_for_stack int ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int *busy) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; int free = 0; if (!grp) return 0; mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) goto out_dbg; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); goto out_dbg; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_warning(sb, "Error %d loading buddy information for %u", err, group); put_bh(bitmap_bh); goto out_dbg; } ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { spin_unlock(&pa->pa_lock); *busy = 1; continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* seems this one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); if (!free) this_cpu_inc(discard_pa_seq); /* we can trust pa_free ... */ free += pa->pa_free; spin_unlock(&pa->pa_lock); list_del(&pa->pa_group_list); list_add(&pa->u.pa_tmp_list, &list); } /* now free all selected PAs */ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { /* remove from object (inode or locality group) */ if (pa->pa_type == MB_GROUP_PA) { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); } else { write_lock(pa->pa_node_lock.inode_lock); ei = EXT4_I(pa->pa_inode); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); } list_del(&pa->u.pa_tmp_list); if (pa->pa_type == MB_GROUP_PA) { ext4_mb_release_group_pa(&e4b, pa); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } else { ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_mb_pa_free(pa); } } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); out_dbg: mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", free, group, grp->bb_free); return free; } /* * releases all non-used preallocated blocks for given inode * * It's important to discard preallocations under i_data_sem * We don't want another block to be served from the prealloc * space when we are discarding the inode prealloc space. * * FIXME!! Make sure it is valid at all the call sites */ void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; if (!S_ISREG(inode->i_mode)) return; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); write_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; } if (pa->pa_deleted == 0) { ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); continue; } /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); write_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from * the list. as we might be called from * ->clear_inode() the inode will get freed * and concurrent thread which is unlinking * pa from inode's list may access already * freed memory, bad-bad-bad */ /* XXX: if this happens too often, we can * add a flag to force wait only in case * of ->clear_inode(), but not in case of * regular truncate */ schedule_timeout_uninterruptible(HZ); goto repeat; } write_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error_err(sb, -err, "Error %d loading buddy information for %u", err, group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); ext4_mb_pa_free(pa); } } static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa; BUG_ON(ext4_pspace_cachep == NULL); pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); if (!pa) return -ENOMEM; atomic_set(&pa->pa_count, 1); ac->ac_pa = pa; return 0; } static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; BUG_ON(!pa); ac->ac_pa = NULL; WARN_ON(!atomic_dec_and_test(&pa->pa_count)); /* * current function is only called due to an error or due to * len of found blocks < len of requested blocks hence the PA has not * been added to grp->bb_prealloc_list. So we don't need to lock it */ pa->pa_deleted = 1; ext4_mb_pa_free(pa); } #ifdef CONFIG_EXT4_DEBUG static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); mb_debug(sb, "groups: "); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; struct list_head *cur; if (!grp) continue; ext4_lock_group(sb, i); list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); mb_debug(sb, "PA:%u:%d:%d\n", i, start, pa->pa_len); } ext4_unlock_group(sb, i); mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, grp->bb_fragments); } } static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" " Allocation context details:"); mb_debug(sb, "status %u flags 0x%x", ac->ac_status, ac->ac_flags); mb_debug(sb, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, (unsigned long)ac->ac_o_ex.fe_logical, (unsigned long)ac->ac_g_ex.fe_group, (unsigned long)ac->ac_g_ex.fe_start, (unsigned long)ac->ac_g_ex.fe_len, (unsigned long)ac->ac_g_ex.fe_logical, (unsigned long)ac->ac_b_ex.fe_group, (unsigned long)ac->ac_b_ex.fe_start, (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); if (ac->ac_pa) mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? "group pa" : "inode pa"); ext4_mb_show_pa(sb); } #else static inline void ext4_mb_show_pa(struct super_block *sb) { } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); } #endif /* * We use locality group preallocation for small size file. The size of the * file is determined by the current size or the resulting size after * allocation which ever is larger * * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits = ac->ac_sb->s_blocksize_bits; loff_t size, isize; bool inode_pa_eligible, group_pa_eligible; if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; /* No point in using inode preallocation for closed files */ if ((size == isize) && !ext4_fs_is_busy(sbi) && !inode_is_open_for_write(ac->ac_inode)) inode_pa_eligible = false; size = max(size, isize); /* Don't use group allocation for large files */ if (size > sbi->s_mb_stream_request) group_pa_eligible = false; if (!group_pa_eligible) { if (inode_pa_eligible) ac->ac_flags |= EXT4_MB_STREAM_ALLOC; else ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } BUG_ON(ac->ac_lg != NULL); /* * locality group prealloc space are per cpu. The reason for having * per cpu locality group is to reduce the contention between block * request from multiple CPUs. */ ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; /* serialize all allocations in the group */ mutex_lock(&ac->ac_lg->lg_mutex); } static noinline_for_stack void ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; unsigned int len; ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ len = ar->len; /* just a dirty hack to filter too big requests */ if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; ac->ac_flags = ar->flags; /* we have to define context: we'll work with a file or * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, inode_is_open_for_write(ar->inode) ? "" : "non-"); } static noinline_for_stack void ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_locality_group *lg, int order, int total_entries) { ext4_group_t group = 0; struct ext4_buddy e4b; LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* * This is the pa that we just used * for block allocation. So don't * free that */ spin_unlock(&pa->pa_lock); continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* only lg prealloc space */ BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_node.lg_list); list_add(&pa->u.pa_tmp_list, &discard_list); total_entries--; if (total_entries <= 5) { /* * we want to keep only 5 entries * allowing it to grow to 8. This * mak sure we don't call discard * soon for this list. */ break; } } spin_unlock(&lg->lg_prealloc_lock); list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { int err; group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error_err(sb, -err, "Error %d loading buddy information for %u", err, group); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } /* * We have incremented pa_count. So it cannot be freed at this * point. Also we hold lg_mutex. So no parallel allocation is * possible from this lg. That means pa_free cannot be updated. * * A parallel ext4_mb_discard_group_preallocations is possible. * which can cause the lg_prealloc_list to be updated. */ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) { int order, added = 0, lg_prealloc_count = 1; struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; order = fls(pa->pa_free) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { spin_unlock(&tmp_pa->pa_lock); continue; } if (!added && pa->pa_free < tmp_pa->pa_free) { /* Add to the tail of the previous entry */ list_add_tail_rcu(&pa->pa_node.lg_list, &tmp_pa->pa_node.lg_list); added = 1; /* * we want to count the total * number of entries in the list */ } spin_unlock(&tmp_pa->pa_lock); lg_prealloc_count++; } if (!added) list_add_tail_rcu(&pa->pa_node.lg_list, &lg->lg_prealloc_list[order]); spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); } /* * release all resource we used in allocation */ static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding * doesn't grow big. */ if (likely(pa->pa_free)) { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); ext4_mb_add_n_trim(ac); } } ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_folio) folio_put(ac->ac_bitmap_folio); if (ac->ac_buddy_folio) folio_put(ac->ac_buddy_folio); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0, busy = 0; int retry = 0; trace_ext4_mb_discard_preallocations(sb, needed); if (needed == 0) needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; repeat: for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, &busy); freed += ret; needed -= ret; cond_resched(); } if (needed > 0 && busy && ++retry < 3) { busy = 0; goto repeat; } return freed; } static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, struct ext4_allocation_context *ac, u64 *seq) { int freed; u64 seq_retry = 0; bool ret = false; freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) { ret = true; goto out_dbg; } seq_retry = ext4_get_discard_pa_seq_sum(); if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { ac->ac_flags |= EXT4_MB_STRICT_CHECK; *seq = seq_retry; ret = true; } out_dbg: mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); return ret; } /* * Simple allocator for Ext4 fast commit replay path. It searches for blocks * linearly starting at the goal block and also excludes the blocks which * are going to be in use after fast commit replay. */ static ext4_fsblk_t ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) { struct buffer_head *bitmap_bh; struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group, nr; ext4_grpblk_t blkoff; ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ar->len = 0; ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { *errp = PTR_ERR(bitmap_bh); pr_warn("Failed to read block bitmap\n"); return 0; } while (1) { i = mb_find_next_zero_bit(bitmap_bh->b_data, max, blkoff); if (i >= max) break; if (ext4_fc_replay_check_excluded(sb, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i))) { blkoff = i + 1; } else break; } brelse(bitmap_bh); if (i < max) break; if (++group >= ext4_get_groups_count(sb)) group = 0; blkoff = 0; } if (i >= max) { *errp = -ENOSPC; return 0; } block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; *errp = 0; return block; } /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; int retries = 0; u64 seq; might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); trace_ext4_request_blocks(ar); if (sbi->s_mount_state & EXT4_FC_REPLAY) return ext4_mb_new_blocks_simple(ar, errp); /* Allow to use superuser reservation for quota file */ if (ext4_is_quota_file(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. */ while (ar->len && ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { ext4_mb_show_pa(sb); *errp = -ENOSPC; return 0; } reserv_clstrs = ar->len; if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { dquot_alloc_block_nofail(ar->inode, EXT4_C2B(sbi, ar->len)); } else { while (ar->len && dquot_alloc_block(ar->inode, EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } } inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; goto out; } } ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; goto out; } ext4_mb_initialize_context(ac, ar); ac->ac_op = EXT4_MB_HISTORY_PREALLOC; seq = this_cpu_read(discard_pa_seq); if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); *errp = ext4_mb_pa_alloc(ac); if (*errp) goto errout; repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); /* * pa allocated above is added to grp->bb_prealloc_list only * when we were able to allocate some block i.e. when * ac->ac_status == AC_STATUS_FOUND. * And error from above mean ac->ac_status != AC_STATUS_FOUND * So we have to free this pa here itself. */ if (*errp) { ext4_mb_pa_put_free(ac); ext4_discard_allocated_blocks(ac); goto errout; } if (ac->ac_status == AC_STATUS_FOUND && ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) ext4_mb_pa_put_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } } else { if (++retries < 3 && ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; /* * If block allocation fails then the pa allocated above * needs to be freed here itself. */ ext4_mb_pa_put_free(ac); *errp = -ENOSPC; } if (*errp) { errout: ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } ext4_mb_release_context(ac); kmem_cache_free(ext4_ac_cachep, ac); out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); } trace_ext4_allocate_blocks(ar, (unsigned long long)block); return block; } /* * We can merge two free data extents only if the physical blocks * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. */ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, struct ext4_free_data *entry, struct ext4_free_data *new_entry, struct rb_root *entry_rb_root) { if ((entry->efd_tid != new_entry->efd_tid) || (entry->efd_group != new_entry->efd_group)) return; if (entry->efd_start_cluster + entry->efd_count == new_entry->efd_start_cluster) { new_entry->efd_start_cluster = entry->efd_start_cluster; new_entry->efd_count += entry->efd_count; } else if (new_entry->efd_start_cluster + new_entry->efd_count == entry->efd_start_cluster) { new_entry->efd_count += entry->efd_count; } else return; spin_lock(&sbi->s_md_lock); list_del(&entry->efd_list); spin_unlock(&sbi->s_md_lock); rb_erase(&entry->efd_node, entry_rb_root); kmem_cache_free(ext4_free_data_cachep, entry); } static noinline_for_stack void ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; ext4_grpblk_t clusters = new_entry->efd_count; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct rb_node **n = &db->bb_free_root.rb_node, *node; struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_folio == NULL); BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. We need to protect buddy cache from being freed, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ folio_get(e4b->bd_buddy_folio); folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, efd_node); if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); kmem_cache_free(ext4_free_data_cachep, new_entry); return; } } rb_link_node(new_node, parent, n); rb_insert_color(new_node, &db->bb_free_root); /* Now try to see the extent can be merged to left and right */ node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); ext4_try_merge_freed_extent(sbi, entry, new_entry, &(db->bb_free_root)); } node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); ext4_try_merge_freed_extent(sbi, entry, new_entry, &(db->bb_free_root)); } spin_lock(&sbi->s_md_lock); list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); } static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { struct super_block *sb = inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; ext4_get_group_no_and_offset(sb, block, &group, &blkoff); ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE, NULL); } /** * ext4_mb_clear_bb() -- helper function for freeing blocks. * Used by ext4_free_blocks() * @handle: handle for this transaction * @inode: inode * @block: starting physical block to be freed * @count: number of blocks to be freed * @flags: flags used by ext4_free_blocks */ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int flags) { struct super_block *sb = inode->i_sb; struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; int mark_flags = 0; ext4_grpblk_t changed; sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_out; } flags |= EXT4_FREE_BLOCKS_VALIDATED; do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); grp = ext4_get_group_info(sb, block_group); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return; /* * Check to see if we are freeing blocks across a group * boundary. */ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { overflow = EXT4_C2B(sbi, bit) + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } count_clusters = EXT4_NUM_B2C(sbi, count); trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) goto error_out; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_clean; } #ifdef AGGRESSIVE_CHECK mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif err = ext4_mb_mark_context(handle, sb, false, block_group, bit, count_clusters, mark_flags, &changed); if (err && changed == 0) goto error_clean; #ifdef AGGRESSIVE_CHECK BUG_ON(changed != count_clusters); #endif /* * We need to make sure we don't reuse the freed block until after the * transaction is committed. We make an exception if the inode is to be * written in writeback mode since writeback mode has weak data * consistency guarantees. */ if (ext4_handle_valid(handle) && ((flags & EXT4_FREE_BLOCKS_METADATA) || !ext4_should_writeback_data(inode))) { struct ext4_free_data *new_entry; /* * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed * to fail. */ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS|__GFP_NOFAIL); new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; new_entry->efd_count = count_clusters; new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, count_clusters); /* * Ignore EOPNOTSUPP error. This is consistent with * what happens when using journal. */ if (err == -EOPNOTSUPP) err = 0; if (err) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, err); } EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); mb_free_blocks(inode, &e4b, bit, count_clusters); } ext4_unlock_group(sb, block_group); /* * on a bigalloc file system, defer the s_freeclusters_counter * update to the caller (ext4_remove_space and friends) so they * can determine if a cluster freed here should be rereserved */ if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); } if (overflow && !err) { block += count; count = overflow; ext4_mb_unload_buddy(&e4b); /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; } error_clean: ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); } /** * ext4_free_blocks() -- Free given blocks and update quota * @handle: handle for this transaction * @inode: inode * @bh: optional buffer of the block to be freed * @block: starting physical block to be freed * @count: number of blocks to be freed * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags) { struct super_block *sb = inode->i_sb; unsigned int overflow; struct ext4_sb_info *sbi; sbi = EXT4_SB(sb); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); else block = bh->b_blocknr; } if (sbi->s_mount_state & EXT4_FC_REPLAY) { ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count)); return; } might_sleep(); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); return; } flags |= EXT4_FREE_BLOCKS_VALIDATED; ext4_debug("freeing block %llu\n", block); trace_ext4_free_blocks(inode, block, count, flags); if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { BUG_ON(count > 1); ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, bh, block); } /* * If the extent to be freed does not begin on a cluster * boundary, we need to deal with partial clusters at the * beginning and end of the extent. Normally we will free * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; block += overflow; if (count > overflow) count -= overflow; else return; } else { block -= overflow; count += overflow; } /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) count -= overflow; else return; } else count += sbi->s_cluster_ratio - overflow; /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { int i; int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) bh = sb_find_get_block_nonatomic(inode->i_sb, block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } ext4_mb_clear_bb(handle, inode, block, count, flags); } /** * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physical block to add to the block group * @count: number of blocks to free * * This marks the blocks as free in the bitmap and buddy. */ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { ext4_group_t block_group; ext4_grpblk_t bit; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; int err = 0; ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); unsigned long cluster_count = last_cluster - first_cluster + 1; ext4_grpblk_t changed; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); if (cluster_count == 0) return 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); /* * Check to see if we are freeing blocks across a group * boundary. */ if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; goto error_out; } err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) goto error_out; if (!ext4_sb_block_valid(sb, NULL, block, count)) { ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); err = -EINVAL; goto error_clean; } err = ext4_mb_mark_context(handle, sb, false, block_group, bit, cluster_count, EXT4_MB_BITMAP_MARKED_CHECK, &changed); if (err && changed == 0) goto error_clean; if (changed != cluster_count) ext4_error(sb, "bit already cleared in group %u", block_group); ext4_lock_group(sb, block_group); mb_free_blocks(NULL, &e4b, bit, cluster_count); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, changed); error_clean: ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); return err; } /** * ext4_trim_extent -- function to TRIM one single free extent in the group * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group * @count: number of blocks to TRIM * @e4b: ext4 buddy for the group * * Trim "count" blocks starting at "start" in the "group". To assure that no * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ static int ext4_trim_extent(struct super_block *sb, int start, int count, struct ext4_buddy *e4b) __releases(bitlock) __acquires(bitlock) { struct ext4_free_extent ex; ext4_group_t group = e4b->bd_group; int ret = 0; trace_ext4_trim_extent(sb, group, start, count); assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; ex.fe_group = group; ex.fe_len = count; /* * Mark blocks used, so no one can reuse them while * being trimmed. */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; } static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, ext4_group_t grp) { unsigned long nr_clusters_in_group; if (grp < (ext4_get_groups_count(sb) - 1)) nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb); else nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, grp)) >> EXT4_CLUSTER_BITS(sb); return nr_clusters_in_group - 1; } static bool ext4_trim_interrupted(void) { return fatal_signal_pending(current) || freezing(current); } static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { ext4_grpblk_t next, count, free_count, last, origin_start; bool set_trimmed = false; void *bitmap; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return 0; last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; if (start == 0 && max >= last) set_trimmed = true; origin_start = start; start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; while (start <= max) { start = mb_find_next_zero_bit(bitmap, max + 1, start); if (start > max) break; next = mb_find_next_bit(bitmap, last + 1, start); if (origin_start == 0 && next >= last) set_trimmed = true; if ((next - start) >= minblocks) { int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) return count; count += next - start; } free_count += next - start; start = next + 1; if (ext4_trim_interrupted()) return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); cond_resched(); ext4_lock_group(sb, e4b->bd_group); } if ((e4b->bd_info->bb_free - free_count) < minblocks) break; } if (set_trimmed) EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); return count; } /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy * bitmap. Then issue a TRIM command on this extent and free the extent in * the group buddy bitmap. */ static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; trace_ext4_trim_all_free(sb, group, start, max); ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_warning(sb, "Error %d loading buddy information for %u", ret, group); return ret; } ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); else ret = 0; ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", ret, group); return ret; } /** * ext4_trim_fs() -- trim ioctl handle function * @sb: superblock for filesystem * @range: fstrim_range structure * * start: First Byte to trim * len: number of Bytes to trim from start * minlen: minimum extent length in Bytes * ext4_trim_fs goes through all allocation groups containing Bytes from * start to start+len. For each such a group ext4_trim_all_free function * is invoked to trim all free space. */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; uint64_t start, end, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); int ret = 0; start = range->start >> sb->s_blocksize_bits; end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = EXT4_NUM_B2C(EXT4_SB(sb), range->minlen >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; /* No point to try to trim less than discard granularity */ if (range->minlen < discard_granularity) { minlen = EXT4_NUM_B2C(EXT4_SB(sb), discard_granularity >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } if (end >= max_blks - 1) end = max_blks - 1; if (end <= first_data_blk) goto out; if (start < first_data_blk) start = first_data_blk; /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, &first_group, &first_cluster); ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, &last_group, &last_cluster); /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { if (ext4_trim_interrupted()) break; grp = ext4_get_group_info(sb, group); if (!grp) continue; /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; } /* * For all the groups except the last one, last cluster will * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ if (group == last_group) end = last_cluster; if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, end, minlen); if (cnt < 0) { ret = cnt; break; } trimmed += cnt; } /* * For every group except the first one, we are sure * that the first cluster to discard will be cluster #0. */ first_cluster = 0; } if (!ret) EXT4_SB(sb)->s_last_trim_minblks = minlen; out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } /* Iterate all the free extents in the group. */ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, ext4_grpblk_t first, ext4_grpblk_t end, ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; ext4_grpblk_t start, next; struct ext4_buddy e4b; int error; error = ext4_mb_load_buddy(sb, group, &e4b); if (error) return error; bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); start = max(e4b.bd_info->bb_first_free, first); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; if (meta_formatter && start != first) { if (start > end) start = end; ext4_unlock_group(sb, group); error = meta_formatter(sb, group, first, start - first, priv); if (error) goto out_unload; ext4_lock_group(sb, group); } while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) break; next = mb_find_next_bit(bitmap, end + 1, start); ext4_unlock_group(sb, group); error = formatter(sb, group, start, next - start, priv); if (error) goto out_unload; ext4_lock_group(sb, group); start = next + 1; } ext4_unlock_group(sb, group); out_unload: ext4_mb_unload_buddy(&e4b); return error; } #ifdef CONFIG_EXT4_KUNIT_TESTS #include "mballoc-test.c" #endif
7 5 2 7 7 2 5 5 4 1 1 2 4 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 // SPDX-License-Identifier: GPL-2.0-or-later /* * sch_plug.c Queue traffic until an explicit release command * * There are two ways to use this qdisc: * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. * * 2. For network output buffering (a.k.a output commit) functionality. * Output commit property is commonly used by applications using checkpoint * based fault-tolerance to ensure that the checkpoint from which a system * is being restored is consistent w.r.t outside world. * * Consider for e.g. Remus - a Virtual Machine checkpointing system, * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated * asynchronously to the backup host, while the VM continues executing the * next epoch speculatively. * * The following is a typical sequence of output buffer operations: * 1.At epoch i, start_buffer(i) * 2. At end of epoch i (i.e. after 50ms): * 2.1 Stop VM and take checkpoint(i). * 2.2 start_buffer(i+1) and Resume VM * 3. While speculatively executing epoch(i+1), asynchronously replicate * checkpoint(i) to backup host. * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) * Thus, this Qdisc would receive the following sequence of commands: * TCQ_PLUG_BUFFER (epoch i) * .. TCQ_PLUG_BUFFER (epoch i+1) * ....TCQ_PLUG_RELEASE_ONE (epoch i) * ......TCQ_PLUG_BUFFER (epoch i+2) * ........ */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> /* * State of the queue, when used for network output buffering: * * plug(i+1) plug(i) head * ------------------+--------------------+----------------> * | | * | | * pkts_current_epoch| pkts_last_epoch |pkts_to_release * ----------------->|<--------+--------->|+---------------> * v v * */ struct plug_sched_data { /* If true, the dequeue function releases all packets * from head to end of the queue. The queue turns into * a pass-through queue for newly arriving packets. */ bool unplug_indefinite; bool throttled; /* Queue Limit in bytes */ u32 limit; /* Number of packets (output) from the current speculatively * executing epoch. */ u32 pkts_current_epoch; /* Number of packets corresponding to the recently finished * epoch. These will be released when we receive a * TCQ_PLUG_RELEASE_ONE command. This command is typically * issued after committing a checkpoint at the target. */ u32 pkts_last_epoch; /* * Number of packets from the head of the queue, that can * be released (committed checkpoint). */ u32 pkts_to_release; }; static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct plug_sched_data *q = qdisc_priv(sch); if (likely(sch->qstats.backlog + skb->len <= q->limit)) { if (!q->unplug_indefinite) q->pkts_current_epoch++; return qdisc_enqueue_tail(skb, sch); } return qdisc_drop(skb, sch, to_free); } static struct sk_buff *plug_dequeue(struct Qdisc *sch) { struct plug_sched_data *q = qdisc_priv(sch); if (q->throttled) return NULL; if (!q->unplug_indefinite) { if (!q->pkts_to_release) { /* No more packets to dequeue. Block the queue * and wait for the next release command. */ q->throttled = true; return NULL; } q->pkts_to_release--; } return qdisc_dequeue_head(sch); } static int plug_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct plug_sched_data *q = qdisc_priv(sch); q->pkts_current_epoch = 0; q->pkts_last_epoch = 0; q->pkts_to_release = 0; q->unplug_indefinite = false; if (opt == NULL) { q->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); } else { struct tc_plug_qopt *ctl = nla_data(opt); if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; q->limit = ctl->limit; } q->throttled = true; return 0; } /* Receives 4 types of messages: * TCQ_PLUG_BUFFER: Inset a plug into the queue and * buffer any incoming packets * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head * to beginning of the next plug. * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. * Stop buffering packets until the next TCQ_PLUG_BUFFER * command is received (just act as a pass-thru queue). * TCQ_PLUG_LIMIT: Increase/decrease queue size */ static int plug_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct plug_sched_data *q = qdisc_priv(sch); struct tc_plug_qopt *msg; msg = nla_data(opt); if (nla_len(opt) < sizeof(*msg)) return -EINVAL; switch (msg->action) { case TCQ_PLUG_BUFFER: /* Save size of the current buffer */ q->pkts_last_epoch = q->pkts_current_epoch; q->pkts_current_epoch = 0; if (q->unplug_indefinite) q->throttled = true; q->unplug_indefinite = false; break; case TCQ_PLUG_RELEASE_ONE: /* Add packets from the last complete buffer to the * packets to be released set. */ q->pkts_to_release += q->pkts_last_epoch; q->pkts_last_epoch = 0; q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_RELEASE_INDEFINITE: q->unplug_indefinite = true; q->pkts_to_release = 0; q->pkts_last_epoch = 0; q->pkts_current_epoch = 0; q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_LIMIT: /* Limit is supplied in bytes */ q->limit = msg->limit; break; default: return -EINVAL; } return 0; } static struct Qdisc_ops plug_qdisc_ops __read_mostly = { .id = "plug", .priv_size = sizeof(struct plug_sched_data), .enqueue = plug_enqueue, .dequeue = plug_dequeue, .peek = qdisc_peek_dequeued, .init = plug_init, .change = plug_change, .reset = qdisc_reset_queue, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("plug"); static int __init plug_module_init(void) { return register_qdisc(&plug_qdisc_ops); } static void __exit plug_module_exit(void) { unregister_qdisc(&plug_qdisc_ops); } module_init(plug_module_init) module_exit(plug_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Qdisc to plug and unplug traffic via netlink control");
25 25 7 39 25 25 33 33 33 33 32 9 4 9 9 8 3 8 1 2 3 3 37 11 38 1 2 2 35 33 1 3 2 1 1 4 3 1 3 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 // SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> */ #define pr_fmt(fmt) "PM: hibernation: " fmt #include <crypto/acompress.h> #include <linux/blkdev.h> #include <linux/export.h> #include <linux/suspend.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> #include <linux/async.h> #include <linux/delay.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/pm.h> #include <linux/nmi.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/ctype.h> #include <linux/ktime.h> #include <linux/security.h> #include <linux/secretmem.h> #include <trace/events/power.h> #include "power.h" static int nocompress; static int noresume; static int nohibernate; static int resume_wait; static unsigned int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; __visible int in_suspend __nosavedata; static char hibernate_compressor[CRYPTO_MAX_ALG_NAME] = CONFIG_HIBERNATION_DEF_COMP; /* * Compression/decompression algorithm to be used while saving/loading * image to/from disk. This would later be used in 'kernel/power/swap.c' * to allocate comp streams. */ char hib_comp_algo[CRYPTO_MAX_ALG_NAME]; enum { HIBERNATION_INVALID, HIBERNATION_PLATFORM, HIBERNATION_SHUTDOWN, HIBERNATION_REBOOT, #ifdef CONFIG_SUSPEND HIBERNATION_SUSPEND, #endif HIBERNATION_TEST_RESUME, /* keep last */ __HIBERNATION_AFTER_LAST }; #define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) #define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) static int hibernation_mode = HIBERNATION_SHUTDOWN; bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; static atomic_t hibernate_atomic = ATOMIC_INIT(1); bool hibernate_acquire(void) { return atomic_add_unless(&hibernate_atomic, -1, 0); } void hibernate_release(void) { atomic_inc(&hibernate_atomic); } bool hibernation_in_progress(void) { return !atomic_read(&hibernate_atomic); } bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION) && !secretmem_active() && !cxl_mem_active(); } /** * hibernation_set_ops - Set the global hibernate operations. * @ops: Hibernation operations to use in subsequent hibernation transitions. */ void hibernation_set_ops(const struct platform_hibernation_ops *ops) { unsigned int sleep_flags; if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore && ops->restore_cleanup && ops->leave)) { WARN_ON(1); return; } sleep_flags = lock_system_sleep(); hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; unlock_system_sleep(sleep_flags); } EXPORT_SYMBOL_GPL(hibernation_set_ops); static bool entering_platform_hibernation; bool system_entering_hibernation(void) { return entering_platform_hibernation; } EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG static unsigned int pm_test_delay = 5; module_param(pm_test_delay, uint, 0644); MODULE_PARM_DESC(pm_test_delay, "Number of seconds to wait before resuming from hibernation test"); static void hibernation_debug_sleep(void) { pr_info("hibernation debug: Waiting for %d second(s).\n", pm_test_delay); mdelay(pm_test_delay * 1000); } static int hibernation_test(int level) { if (pm_test_level == level) { hibernation_debug_sleep(); return 1; } return 0; } #else /* !CONFIG_PM_DEBUG */ static int hibernation_test(int level) { return 0; } #endif /* !CONFIG_PM_DEBUG */ /** * platform_begin - Call platform to start hibernation. * @platform_mode: Whether or not to use the platform driver. */ static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->begin(PMSG_FREEZE) : 0; } /** * platform_end - Call platform to finish transition to the working state. * @platform_mode: Whether or not to use the platform driver. */ static void platform_end(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->end(); } /** * platform_pre_snapshot - Call platform to prepare the machine for hibernation. * @platform_mode: Whether or not to use the platform driver. * * Use the platform driver to prepare the system for creating a hibernate image, * if so configured, and return an error code if that fails. */ static int platform_pre_snapshot(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->pre_snapshot() : 0; } /** * platform_leave - Call platform to prepare a transition to the working state. * @platform_mode: Whether or not to use the platform driver. * * Use the platform driver prepare to prepare the machine for switching to the * normal mode of operation. * * This routine is called on one CPU with interrupts disabled. */ static void platform_leave(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->leave(); } /** * platform_finish - Call platform to switch the system to the working state. * @platform_mode: Whether or not to use the platform driver. * * Use the platform driver to switch the machine to the normal mode of * operation. * * This routine must be called after platform_prepare(). */ static void platform_finish(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->finish(); } /** * platform_pre_restore - Prepare for hibernate image restoration. * @platform_mode: Whether or not to use the platform driver. * * Use the platform driver to prepare the system for resume from a hibernation * image. * * If the restore fails after this function has been called, * platform_restore_cleanup() must be called. */ static int platform_pre_restore(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->pre_restore() : 0; } /** * platform_restore_cleanup - Switch to the working state after failing restore. * @platform_mode: Whether or not to use the platform driver. * * Use the platform driver to switch the system to the normal mode of operation * after a failing restore. * * If platform_pre_restore() has been called before the failing restore, this * function must be called too, regardless of the result of * platform_pre_restore(). */ static void platform_restore_cleanup(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->restore_cleanup(); } /** * platform_recover - Recover from a failure to suspend devices. * @platform_mode: Whether or not to use the platform driver. */ static void platform_recover(int platform_mode) { if (platform_mode && hibernation_ops && hibernation_ops->recover) hibernation_ops->recover(); } /** * swsusp_show_speed - Print time elapsed between two events during hibernation. * @start: Starting event. * @stop: Final event. * @nr_pages: Number of memory pages processed between @start and @stop. * @msg: Additional diagnostic message to print. */ void swsusp_show_speed(ktime_t start, ktime_t stop, unsigned nr_pages, char *msg) { ktime_t diff; u64 elapsed_centisecs64; unsigned int centisecs; unsigned int k; unsigned int kps; diff = ktime_sub(stop, start); elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC); centisecs = elapsed_centisecs64; if (centisecs == 0) centisecs = 1; /* avoid div-by-zero */ k = nr_pages * (PAGE_SIZE / 1024); kps = (k * 100) / centisecs; pr_info("%s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", msg, k, centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10); } __weak int arch_resume_nosmt(void) { return 0; } /** * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. * * Execute device drivers' "late" and "noirq" freeze callbacks, create a * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. * * Control reappears in this routine after the subsequent restore. */ static int create_image(int platform_mode) { int error; error = dpm_suspend_end(PMSG_FREEZE); if (error) { pr_err("Some devices failed to power down, aborting\n"); return error; } error = platform_pre_snapshot(platform_mode); if (error || hibernation_test(TEST_PLATFORM)) goto Platform_finish; error = pm_sleep_disable_secondary_cpus(); if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; local_irq_disable(); system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) { pr_err("Some system devices failed to power down, aborting\n"); goto Enable_irqs; } if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) goto Power_up; in_suspend = 1; save_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); /* Restore control flow magically appears here */ restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) pr_err("Error %d creating image\n", error); if (!in_suspend) { events_check_enabled = false; clear_or_poison_free_pages(); } platform_leave(platform_mode); Power_up: syscore_resume(); Enable_irqs: system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: pm_sleep_enable_secondary_cpus(); /* Allow architectures to do nosmt-specific post-resume dances */ if (!in_suspend) error = arch_resume_nosmt(); Platform_finish: platform_finish(platform_mode); dpm_resume_start(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; } /** * hibernation_snapshot - Quiesce devices and create a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * * This routine must be called with system_transition_mutex held. */ int hibernation_snapshot(int platform_mode) { pm_message_t msg; int error; pm_suspend_clear_flags(); error = platform_begin(platform_mode); if (error) goto Close; /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); if (error) goto Close; error = freeze_kernel_threads(); if (error) goto Cleanup; if (hibernation_test(TEST_FREEZER)) { /* * Indicate to the caller that we are returning due to a * successful freezer test. */ freezer_test_done = true; goto Thaw; } error = dpm_prepare(PMSG_FREEZE); if (error) { dpm_complete(PMSG_RECOVER); goto Thaw; } console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); if (error || hibernation_test(TEST_DEVICES)) platform_recover(platform_mode); else error = create_image(platform_mode); /* * In the case that we call create_image() above, the control * returns here (1) after the image has been created or the * image creation has failed and (2) after a successful restore. */ /* We may need to release the preallocated image pages here. */ if (error || !in_suspend) swsusp_free(); msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; dpm_resume(msg); if (error || !in_suspend) pm_restore_gfp_mask(); console_resume_all(); dpm_complete(msg); Close: platform_end(platform_mode); return error; Thaw: thaw_kernel_threads(); Cleanup: swsusp_free(); goto Close; } int __weak hibernate_resume_nonboot_cpu_disable(void) { return suspend_disable_secondary_cpus(); } /** * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. * * Execute device drivers' "noirq" and "late" freeze callbacks, restore the * contents of highmem that have not been restored yet from the image and run * the low-level code that will restore the remaining contents of memory and * switch to the just restored target kernel. */ static int resume_target_kernel(bool platform_mode) { int error; error = dpm_suspend_end(PMSG_QUIESCE); if (error) { pr_err("Some devices failed to power down, aborting resume\n"); return error; } error = platform_pre_restore(platform_mode); if (error) goto Cleanup; cpuidle_pause(); error = hibernate_resume_nonboot_cpu_disable(); if (error) goto Enable_cpus; local_irq_disable(); system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) goto Enable_irqs; save_processor_state(); error = restore_highmem(); if (!error) { error = swsusp_arch_resume(); /* * The code below is only ever reached in case of a failure. * Otherwise, execution continues at the place where * swsusp_arch_suspend() was called. */ BUG_ON(!error); /* * This call to restore_highmem() reverts the changes made by * the previous one. */ restore_highmem(); } /* * The only reason why swsusp_arch_resume() can fail is memory being * very tight, so we have to free it as soon as we can to avoid * subsequent failures. */ swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); syscore_resume(); Enable_irqs: system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: pm_sleep_enable_secondary_cpus(); Cleanup: platform_restore_cleanup(platform_mode); dpm_resume_start(PMSG_RECOVER); return error; } /** * hibernation_restore - Quiesce devices and restore from a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * * This routine must be called with system_transition_mutex held. If it is * successful, control reappears in the restored target kernel in * hibernation_snapshot(). */ int hibernation_restore(int platform_mode) { int error; pm_prepare_console(); console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); /* * The above should either succeed and jump to the new kernel, * or return with an error. Otherwise things are just * undefined, so let's be paranoid. */ BUG_ON(!error); } dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); console_resume_all(); pm_restore_console(); return error; } /** * hibernation_platform_enter - Power off the system using the platform driver. */ int hibernation_platform_enter(void) { int error; if (!hibernation_ops) return -ENOSYS; /* * We have cancelled the power transition by running * hibernation_ops->finish() before saving the image, so we should let * the firmware know that we're going to enter the sleep state after all */ error = hibernation_ops->begin(PMSG_HIBERNATE); if (error) goto Close; entering_platform_hibernation = true; console_suspend_all(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) hibernation_ops->recover(); goto Resume_devices; } error = dpm_suspend_end(PMSG_HIBERNATE); if (error) goto Resume_devices; error = hibernation_ops->prepare(); if (error) goto Platform_finish; error = pm_sleep_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) goto Enable_irqs; if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; } hibernation_ops->enter(); /* We should never get here */ while (1); Power_up: syscore_resume(); Enable_irqs: system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: pm_sleep_enable_secondary_cpus(); Platform_finish: hibernation_ops->finish(); dpm_resume_start(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); console_resume_all(); Close: hibernation_ops->end(); return error; } /** * power_down - Shut the machine down for hibernation. * * Use the platform driver, if configured, to put the system into the sleep * state corresponding to hibernation, or try to power it off or reboot, * depending on the value of hibernation_mode. */ static void power_down(void) { int error; #ifdef CONFIG_SUSPEND if (hibernation_mode == HIBERNATION_SUSPEND) { error = suspend_devices_and_enter(mem_sleep_current); if (error) { hibernation_mode = hibernation_ops ? HIBERNATION_PLATFORM : HIBERNATION_SHUTDOWN; } else { /* Restore swap signature. */ error = swsusp_unmark(); if (error) pr_err("Swap will be unusable! Try swapon -a.\n"); return; } } #endif switch (hibernation_mode) { case HIBERNATION_REBOOT: kernel_restart(NULL); break; case HIBERNATION_PLATFORM: error = hibernation_platform_enter(); if (error == -EAGAIN || error == -EBUSY) { swsusp_unmark(); events_check_enabled = false; pr_info("Wakeup event detected during hibernation, rolling back.\n"); return; } fallthrough; case HIBERNATION_SHUTDOWN: if (kernel_can_power_off()) { entering_platform_hibernation = true; kernel_power_off(); entering_platform_hibernation = false; } break; } kernel_halt(); /* * Valid image is on the disk, if we continue we risk serious data * corruption after resume. */ pr_crit("Power down manually\n"); while (1) cpu_relax(); } static int load_image_and_restore(void) { int error; unsigned int flags; pm_pr_dbg("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) { swsusp_close(); goto Unlock; } error = swsusp_read(&flags); swsusp_close(); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); pr_err("Failed to load image, recovering.\n"); swsusp_free(); free_basic_memory_bitmaps(); Unlock: unlock_device_hotplug(); return error; } #define COMPRESSION_ALGO_LZO "lzo" #define COMPRESSION_ALGO_LZ4 "lz4" /** * hibernate - Carry out system hibernation, including saving the image. */ int hibernate(void) { bool snapshot_test = false; unsigned int sleep_flags; int error; if (!hibernation_available()) { pm_pr_dbg("Hibernation not available.\n"); return -EPERM; } /* * Query for the compression algorithm support if compression is enabled. */ if (!nocompress) { strscpy(hib_comp_algo, hibernate_compressor); if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); return -EOPNOTSUPP; } } sleep_flags = lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } pr_info("hibernation entry\n"); pm_prepare_console(); error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); if (error) goto Restore; ksys_sync_helper(); if (filesystem_freeze_enabled) filesystems_freeze(); error = freeze_processes(); if (error) goto Exit; lock_device_hotplug(); /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) goto Thaw; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error || freezer_test_done) goto Free_bitmaps; if (in_suspend) { unsigned int flags = 0; if (hibernation_mode == HIBERNATION_PLATFORM) flags |= SF_PLATFORM_MODE; if (nocompress) { flags |= SF_NOCOMPRESS_MODE; } else { flags |= SF_CRC32_MODE; /* * By default, LZO compression is enabled. Use SF_COMPRESSION_ALG_LZ4 * to override this behaviour and use LZ4. * * Refer kernel/power/power.h for more details */ if (!strcmp(hib_comp_algo, COMPRESSION_ALGO_LZ4)) flags |= SF_COMPRESSION_ALG_LZ4; else flags |= SF_COMPRESSION_ALG_LZO; } pm_pr_dbg("Writing hibernation image.\n"); error = swsusp_write(flags); swsusp_free(); if (!error) { if (hibernation_mode == HIBERNATION_TEST_RESUME) snapshot_test = true; else power_down(); } in_suspend = 0; pm_restore_gfp_mask(); } else { pm_pr_dbg("Hibernation image restored successfully.\n"); } Free_bitmaps: free_basic_memory_bitmaps(); Thaw: unlock_device_hotplug(); if (snapshot_test) { pm_pr_dbg("Checking hibernation image\n"); error = swsusp_check(false); if (!error) error = load_image_and_restore(); } thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: filesystems_thaw(); pm_notifier_call_chain(PM_POST_HIBERNATION); Restore: pm_restore_console(); hibernate_release(); Unlock: unlock_system_sleep(sleep_flags); pr_info("hibernation exit\n"); return error; } /** * hibernate_quiet_exec - Execute a function with all devices frozen. * @func: Function to execute. * @data: Data pointer to pass to @func. * * Return the @func return value or an error code if it cannot be executed. */ int hibernate_quiet_exec(int (*func)(void *data), void *data) { unsigned int sleep_flags; int error; sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; goto unlock; } pm_prepare_console(); error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); if (error) goto restore; if (filesystem_freeze_enabled) filesystems_freeze(); error = freeze_processes(); if (error) goto exit; lock_device_hotplug(); pm_suspend_clear_flags(); error = platform_begin(true); if (error) goto thaw; error = freeze_kernel_threads(); if (error) goto thaw; error = dpm_prepare(PMSG_FREEZE); if (error) goto dpm_complete; console_suspend_all(); error = dpm_suspend(PMSG_FREEZE); if (error) goto dpm_resume; error = dpm_suspend_end(PMSG_FREEZE); if (error) goto dpm_resume; error = platform_pre_snapshot(true); if (error) goto skip; error = func(data); skip: platform_finish(true); dpm_resume_start(PMSG_THAW); dpm_resume: dpm_resume(PMSG_THAW); console_resume_all(); dpm_complete: dpm_complete(PMSG_THAW); thaw_kernel_threads(); thaw: platform_end(true); unlock_device_hotplug(); thaw_processes(); exit: filesystems_thaw(); pm_notifier_call_chain(PM_POST_HIBERNATION); restore: pm_restore_console(); hibernate_release(); unlock: unlock_system_sleep(sleep_flags); return error; } EXPORT_SYMBOL_GPL(hibernate_quiet_exec); static int __init find_resume_device(void) { if (!strlen(resume_file)) return -ENOENT; pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); if (resume_delay) { pr_info("Waiting %dsec before reading resume device ...\n", resume_delay); ssleep(resume_delay); } /* Check if the device is there */ if (!early_lookup_bdev(resume_file, &swsusp_resume_device)) return 0; /* * Some device discovery might still be in progress; we need to wait for * this to finish. */ wait_for_device_probe(); if (resume_wait) { while (early_lookup_bdev(resume_file, &swsusp_resume_device)) msleep(10); async_synchronize_full(); } return early_lookup_bdev(resume_file, &swsusp_resume_device); } static int software_resume(void) { int error; pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); pm_pr_dbg("Looking for hibernation image.\n"); mutex_lock(&system_transition_mutex); error = swsusp_check(true); if (error) goto Unlock; /* * Check if the hibernation image is compressed. If so, query for * the algorithm support. */ if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) { if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4) strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4); else strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO); if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); error = -EOPNOTSUPP; goto Unlock; } } /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; swsusp_close(); goto Unlock; } pr_info("resume from hibernation\n"); pm_prepare_console(); error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); if (error) goto Restore; if (filesystem_freeze_enabled) filesystems_freeze(); pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); if (error) { filesystems_thaw(); goto Close_Finish; } error = freeze_kernel_threads(); if (error) { thaw_processes(); filesystems_thaw(); goto Close_Finish; } error = load_image_and_restore(); thaw_processes(); filesystems_thaw(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); Restore: pm_restore_console(); pr_info("resume failed (%d)\n", error); hibernate_release(); /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&system_transition_mutex); pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: swsusp_close(); goto Finish; } /** * software_resume_initcall - Resume from a saved hibernation image. * * This routine is called as a late initcall, when all devices have been * discovered and initialized already. * * The image reading code is called to see if there is a hibernation image * available for reading. If that is the case, devices are quiesced and the * contents of memory is restored from the saved image. * * If this is successful, control reappears in the restored target kernel in * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine * attempts to recover gracefully and make the kernel return to the normal mode * of operation. */ static int __init software_resume_initcall(void) { /* * If the user said "noresume".. bail out early. */ if (noresume || !hibernation_available()) return 0; if (!swsusp_resume_device) { int error = find_resume_device(); if (error) return error; } return software_resume(); } late_initcall_sync(software_resume_initcall); static const char * const hibernation_modes[] = { [HIBERNATION_PLATFORM] = "platform", [HIBERNATION_SHUTDOWN] = "shutdown", [HIBERNATION_REBOOT] = "reboot", #ifdef CONFIG_SUSPEND [HIBERNATION_SUSPEND] = "suspend", #endif [HIBERNATION_TEST_RESUME] = "test_resume", }; /* * /sys/power/disk - Control hibernation mode. * * Hibernation can be handled in several ways. There are a few different ways * to put the system into the sleep state: using the platform driver (e.g. ACPI * or other hibernation_ops), powering it off or rebooting it (for testing * mostly). * * The sysfs file /sys/power/disk provides an interface for selecting the * hibernation mode to use. Reading from this file causes the available modes * to be printed. There are 3 modes that can be supported: * * 'platform' * 'shutdown' * 'reboot' * * If a platform hibernation driver is in use, 'platform' will be supported * and will be used by default. Otherwise, 'shutdown' will be used by default. * The selected option (i.e. the one corresponding to the current value of * hibernation_mode) is enclosed by a square bracket. * * To select a given hibernation mode it is necessary to write the mode's * string representation (as returned by reading from /sys/power/disk) back * into /sys/power/disk. */ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; int i; if (!hibernation_available()) return sysfs_emit(buf, "[disabled]\n"); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (!hibernation_modes[i]) continue; switch (i) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif case HIBERNATION_TEST_RESUME: break; case HIBERNATION_PLATFORM: if (hibernation_ops) break; /* not a valid mode, continue with loop */ continue; } if (i == hibernation_mode) count += sysfs_emit_at(buf, count, "[%s] ", hibernation_modes[i]); else count += sysfs_emit_at(buf, count, "%s ", hibernation_modes[i]); } /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int mode = HIBERNATION_INVALID; unsigned int sleep_flags; int error = 0; int len; char *p; int i; if (!hibernation_available()) return -EPERM; p = memchr(buf, '\n', n); len = p ? p - buf : n; sleep_flags = lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { mode = i; break; } } if (mode != HIBERNATION_INVALID) { switch (mode) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif case HIBERNATION_TEST_RESUME: hibernation_mode = mode; break; case HIBERNATION_PLATFORM: if (hibernation_ops) hibernation_mode = mode; else error = -EINVAL; } } else error = -EINVAL; if (!error) pm_pr_dbg("Hibernation mode set to '%s'\n", hibernation_modes[mode]); unlock_system_sleep(sleep_flags); return error ? error : n; } power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int sleep_flags; int len = n; char *name; dev_t dev; int error; if (!hibernation_available()) return n; if (len && buf[len-1] == '\n') len--; name = kstrndup(buf, len, GFP_KERNEL); if (!name) return -ENOMEM; error = lookup_bdev(name, &dev); if (error) { unsigned maj, min, offset; char *p, dummy; error = 0; if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { dev = MKDEV(maj, min); if (maj != MAJOR(dev) || min != MINOR(dev)) error = -EINVAL; } else { dev = new_decode_dev(simple_strtoul(name, &p, 16)); if (*p) error = -EINVAL; } } kfree(name); if (error) return error; sleep_flags = lock_system_sleep(); swsusp_resume_device = dev; unlock_system_sleep(sleep_flags); pm_pr_dbg("Configured hibernation resume from disk to %u\n", swsusp_resume_device); noresume = 0; software_resume(); return n; } power_attr(resume); static ssize_t resume_offset_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)swsusp_resume_block); } static ssize_t resume_offset_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long long offset; int rc; rc = kstrtoull(buf, 0, &offset); if (rc) return rc; swsusp_resume_block = offset; return n; } power_attr(resume_offset); static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", image_size); } static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long size; if (sscanf(buf, "%lu", &size) == 1) { image_size = size; return n; } return -EINVAL; } power_attr(image_size); static ssize_t reserved_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", reserved_size); } static ssize_t reserved_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long size; if (sscanf(buf, "%lu", &size) == 1) { reserved_size = size; return n; } return -EINVAL; } power_attr(reserved_size); static struct attribute *g[] = { &disk_attr.attr, &resume_offset_attr.attr, &resume_attr.attr, &image_size_attr.attr, &reserved_size_attr.attr, NULL, }; static const struct attribute_group attr_group = { .attrs = g, }; static int __init pm_disk_init(void) { return sysfs_create_group(power_kobj, &attr_group); } core_initcall(pm_disk_init); static int __init resume_setup(char *str) { if (noresume) return 1; strscpy(resume_file, str); return 1; } static int __init resume_offset_setup(char *str) { unsigned long long offset; if (noresume) return 1; if (sscanf(str, "%llu", &offset) == 1) swsusp_resume_block = offset; return 1; } static int __init hibernate_setup(char *str) { if (!strncmp(str, "noresume", 8)) { noresume = 1; } else if (!strncmp(str, "nocompress", 10)) { nocompress = 1; } else if (!strncmp(str, "no", 2)) { noresume = 1; nohibernate = 1; } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && !strncmp(str, "protect_image", 13)) { enable_restore_image_protection(); } return 1; } static int __init noresume_setup(char *str) { noresume = 1; return 1; } static int __init resumewait_setup(char *str) { resume_wait = 1; return 1; } static int __init resumedelay_setup(char *str) { int rc = kstrtouint(str, 0, &resume_delay); if (rc) pr_warn("resumedelay: bad option string '%s'\n", str); return 1; } static int __init nohibernate_setup(char *str) { noresume = 1; nohibernate = 1; return 1; } static const char * const comp_alg_enabled[] = { #if IS_ENABLED(CONFIG_CRYPTO_LZO) COMPRESSION_ALGO_LZO, #endif #if IS_ENABLED(CONFIG_CRYPTO_LZ4) COMPRESSION_ALGO_LZ4, #endif }; static int hibernate_compressor_param_set(const char *compressor, const struct kernel_param *kp) { int index, ret; if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; index = sysfs_match_string(comp_alg_enabled, compressor); if (index >= 0) { ret = param_set_copystring(comp_alg_enabled[index], kp); if (!ret) strscpy(hib_comp_algo, comp_alg_enabled[index]); } else { ret = index; } mutex_unlock(&system_transition_mutex); if (ret) pr_debug("Cannot set specified compressor %s\n", compressor); return ret; } static const struct kernel_param_ops hibernate_compressor_param_ops = { .set = hibernate_compressor_param_set, .get = param_get_string, }; static struct kparam_string hibernate_compressor_param_string = { .maxlen = sizeof(hibernate_compressor), .string = hibernate_compressor, }; module_param_cb(compressor, &hibernate_compressor_param_ops, &hibernate_compressor_param_string, 0644); MODULE_PARM_DESC(compressor, "Compression algorithm to be used with hibernation"); __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup);
1 37 66 65 65 1 65 1 64 19 19 19 19 19 19 19 19 2 2 21 397 398 5 393 397 144 64 79 80 80 79 18 79 80 80 80 80 79 501 502 19 1 18 1 80 12 69 67 2 69 27 53 302 3 298 418 21 397 386 14 398 2 397 395 3 3 3 1 18 9 37 37 37 37 37 35 37 24 14 23 9 43 2 3 25 26 12 10 4 6 30 36 2 397 64 15 3 13 397 397 6 1 1 381 12 387 388 6 6 426 27 402 403 214 214 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/super.c * * Copyright (C) 1991, 1992 Linus Torvalds * * super.c contains code to handle: - mount structures * - super-block tables * - filesystem drivers list * - mount system call * - umount system call * - ustat system call * * GK 2/5/95 - Changed to support mounting the root fs via NFS * * Added kerneld support: Jacques Gelinas and Bjorn Ekwall * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 * Added options to /proc/mounts: * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ #include <linux/export.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" static int thaw_super_locked(struct super_block *sb, enum freeze_holder who, const void *freeze_owner); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", "sb_pagefaults", "sb_internal", }; static inline void __super_lock(struct super_block *sb, bool excl) { if (excl) down_write(&sb->s_umount); else down_read(&sb->s_umount); } static inline void super_unlock(struct super_block *sb, bool excl) { if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); } static inline void __super_lock_excl(struct super_block *sb) { __super_lock(sb, true); } static inline void super_unlock_excl(struct super_block *sb) { super_unlock(sb, true); } static inline void super_unlock_shared(struct super_block *sb) { super_unlock(sb, false); } static bool super_flags(const struct super_block *sb, unsigned int flags) { /* * Pairs with smp_store_release() in super_wake() and ensures * that we see @flags after we're woken. */ return smp_load_acquire(&sb->s_flags) & flags; } /** * super_lock - wait for superblock to become ready and lock it * @sb: superblock to wait for * @excl: whether exclusive access is required * * If the superblock has neither passed through vfs_get_tree() or * generic_shutdown_super() yet wait for it to happen. Either superblock * creation will succeed and SB_BORN is set by vfs_get_tree() or we're * woken and we'll see SB_DYING. * * The caller must have acquired a temporary reference on @sb->s_count. * * Return: The function returns true if SB_BORN was set and with * s_umount held. The function returns false if SB_DYING was * set and without s_umount held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { lockdep_assert_not_held(&sb->s_umount); /* wait until the superblock is ready or dying */ wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING)); /* Don't pointlessly acquire s_umount. */ if (super_flags(sb, SB_DYING)) return false; __super_lock(sb, excl); /* * Has gone through generic_shutdown_super() in the meantime. * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. */ if (sb->s_flags & SB_DYING) { super_unlock(sb, excl); return false; } WARN_ON_ONCE(!(sb->s_flags & SB_BORN)); return true; } /* wait and try to acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } /* wait and try to acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); } /* wake waiters */ #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) static void super_wake(struct super_block *sb, unsigned int flag) { WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); /* * Pairs with smp_load_acquire() in super_lock() to make sure * all initializations in the superblock are seen by the user * seeing SB_BORN sent. */ smp_store_release(&sb->s_flags, sb->s_flags | flag); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees SB_BORN set or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); wake_up_var(&sb->s_flags); } /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long fs_objects = 0; long total_objects; long freed = 0; long dentries; long inodes; sb = shrink->private_data; /* * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects; if (!total_objects) total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches * * Ensure that we always scan at least one object - memcg kmem * accounting uses this to fully empty the caches. */ sc->nr_to_scan = dentries + 1; freed = prune_dcache_sb(sb, sc); sc->nr_to_scan = inodes + 1; freed += prune_icache_sb(sb, sc); if (fs_objects) { sc->nr_to_scan = fs_objects + 1; freed += sb->s_op->free_cached_objects(sb, sc); } super_unlock_shared(sb); return freed; } static unsigned long super_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long total_objects = 0; sb = shrink->private_data; /* * We don't call super_trylock_shared() here as it is a scalability * bottleneck, so we're exposed to partial setup state. The shrinker * rwsem does not protect filesystem operations backing * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can * change between super_cache_count and super_cache_scan, so we really * don't need locks here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it * is dangerous to access it. super_trylock_shared() uses a SB_BORN check * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) return 0; smp_rmb(); if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); if (!total_objects) return SHRINK_EMPTY; total_objects = vfs_pressure_ratio(total_objects); return total_objects; } static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); fsnotify_sb_free(s); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); for (int i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } static void destroy_super_rcu(struct rcu_head *head) { struct super_block *s = container_of(head, struct super_block, rcu); INIT_WORK(&s->destroy_work, destroy_super_work); schedule_work(&s->destroy_work); } /* Free a superblock that has never been seen by anyone */ static void destroy_unused_super(struct super_block *s) { if (!s) return; super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL); static const struct super_operations default_op; int i; if (!s) return NULL; INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * sget() can have s_umount recursion. * * When it cannot find a suitable sb, it allocates a new * one (this one), and tries again to find a suitable old * one. * * In case that succeeds, it will acquire the s_umount * lock of the old one. Since these are clearly distrinct * locks, and this object isn't exposed yet, there's no * risk of deadlocks. * * Annotate this by putting this lock in a different * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], sb_writers_name[i], &type->s_writers_key[i])) goto fail; } s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "sb-%s", type->name); if (!s->s_shrink) goto fail; s->s_shrink->scan_objects = super_cache_scan; s->s_shrink->count_objects = super_cache_count; s->s_shrink->batch = 1024; s->s_shrink->private_data = s; if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; return s; fail: destroy_unused_super(s); return NULL; } /* Superblock refcounting */ /* * Drop a superblock's refcount. The caller must hold sb_lock. */ static void __put_super(struct super_block *s) { if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); call_rcu(&s->rcu, destroy_super_rcu); } } /** * put_super - drop a temporary reference to superblock * @sb: superblock in question * * Drops a temporary reference, frees superblock if there's no * references left. */ void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); spin_unlock(&sb_lock); } static void kill_super_notify(struct super_block *sb) { lockdep_assert_not_held(&sb->s_umount); /* already notified earlier */ if (sb->s_flags & SB_DEAD) return; /* * Remove it from @fs_supers so it isn't found by new * sget{_fc}() walkers anymore. Any concurrent mounter still * managing to grab a temporary reference is guaranteed to * already see SB_DYING and will wait until we notify them about * SB_DEAD. */ spin_lock(&sb_lock); hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); /* * Let concurrent mounts know that this thing is really dead. * We don't need @sb->s_umount here as every concurrent caller * will see SB_DYING and either discard the superblock or wait * for SB_DEAD. */ super_wake(sb, SB_DEAD); } /** * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * * Drops an active reference to superblock, converting it into a temporary * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. * * Caller holds exclusive lock on superblock; that lock is released. */ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { shrinker_free(s->s_shrink); fs->kill_sb(s); kill_super_notify(s); /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy * the lru lists right now. */ list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); put_filesystem(fs); put_super(s); } else { super_unlock_excl(s); } } EXPORT_SYMBOL(deactivate_locked_super); /** * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * * Variant of deactivate_locked_super(), except that superblock is *not* * locked by caller. If we are going to drop the final active reference, * lock will be acquired prior to that. */ void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { __super_lock_excl(s); deactivate_locked_super(s); } } EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for * an active reference. This is used in sget{_fc}() to wait for a * superblock to either become SB_BORN or for it to pass through * sb->kill() and be marked as SB_DEAD. * * Return: This returns true if an active reference could be acquired, * false if not. */ static bool grab_super(struct super_block *sb) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_excl(sb); if (locked) { if (atomic_inc_not_zero(&sb->s_active)) { put_super(sb); return true; } super_unlock_excl(sb); } wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD)); put_super(sb); return false; } /* * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the * filesystem is not shut down while we are working on it. It returns * false if we cannot acquire s_umount or if we lose the race and * filesystem already got into shutdown, and returns true with the s_umount * lock held in read mode in case of success. On successful return, * the caller must drop the s_umount lock when done. * * Note that unlike get_super() et.al. this one does *not* bump ->s_count. * The reason why it's safe is that we are OK with doing trylock instead * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!(sb->s_flags & SB_DYING) && sb->s_root && (sb->s_flags & SB_BORN)) return true; super_unlock_shared(sb); } return false; } /** * retire_super - prevents superblock from being reused * @sb: superblock to retire * * The function marks superblock to be ignored in superblock test, which * prevents it from being reused for any new mounts. If the superblock has * a private bdi, it also unregisters it, but doesn't reduce the refcount * of the superblock to prevent potential races. The refcount is reduced * by generic_shutdown_super(). The function can not be called * concurrently with generic_shutdown_super(). It is safe to call the * function multiple times, subsequent calls have no effect. * * The marker will affect the re-use only for block-device-based * superblocks. Other superblocks will still get marked if this function * is used, but that will not affect their reusability. */ void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * * generic_shutdown_super() does all fs-independent work on superblock * shutdown. Typical ->kill_sb() should pick all fs-specific objects * that need destruction out of superblock, call generic_shutdown_super() * and release aforementioned objects. Note: dentries and inodes _are_ * taken care of and do not need specific handling. * * Upon calling this function, the filesystem may no longer alter or * rearrange the set of dentries belonging to this super_block, nor may it * change the attachments of dentries to inodes. */ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; cgroup_writeback_umount(sb); /* Evict all inodes with zero refcount. */ evict_inodes(sb); /* * Clean up and evict any inodes that still have references due * to fsnotify or the security policy. */ fsnotify_sb_delete(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; } if (sop->put_super) sop->put_super(sb); /* * Now that all potentially-encrypted inodes have been evicted, * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), NULL, "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* * Adding a proper bailout path here would be hard, but * we can at least make it more likely that a later * iput_final() or such crashes cleanly. */ struct inode *inode; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { inode->i_op = VFS_PTR_POISON; inode->i_sb = VFS_PTR_POISON; inode->i_mapping = VFS_PTR_POISON; } spin_unlock(&sb->s_inode_list_lock); } } /* * Broadcast to everyone that grabbed a temporary reference to this * superblock before we removed it from @fs_supers that the superblock * is dying. Every walker of @fs_supers outside of sget{_fc}() will now * discard this superblock and treat it as dead. * * We leave the superblock on @fs_supers so it can be found by * sget{_fc}() until we passed sb->kill_sb(). */ super_wake(sb, SB_DYING); super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } } EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /** * sget_fc - Find or create a superblock * @fc: Filesystem context. * @test: Comparison callback * @set: Setup callback * * Create a new superblock or find an existing one. * * The @test callback is used to find a matching existing superblock. * Whether or not the requested parameters in @fc are taken into account * is specific to the @test callback that is used. They may even be * completely ignored. * * If an extant superblock is matched, it will be returned unless: * * (1) the namespace the filesystem context @fc and the extant * superblock's namespace differ * * (2) the filesystem context @fc has requested that reusing an extant * superblock is not allowed * * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info and s_id will be * set and the @set callback will be invoked), the superblock will be * published and it will be returned in a partially constructed state * with SB_BORN and SB_ACTIVE as yet unset. * * Return: On success, an extant or newly created superblock is * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)) { struct super_block *s = NULL; struct super_block *old; struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; /* * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is * not set, as the filesystem is likely unprepared to handle it. * This can happen when fsconfig() is called from init_user_ns with * an fs_fd opened in another user namespace. */ if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed"); return ERR_PTR(-EPERM); } retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } s->s_fs_info = fc->s_fs_info; err = set(s, fc); if (err) { s->s_fs_info = NULL; spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); /* * Make the superblock visible on @super_blocks and @fs_supers. * It's in a nascent state and users should wait on SB_BORN or * SB_DYING to be set. */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); shrinker_register(s->s_shrink); return s; share_extant_sb: if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); if (fc->exclusive) warnfc(fc, "reusing existing filesystem not allowed"); else warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } EXPORT_SYMBOL(sget_fc); /** * sget - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data) { struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type, flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } err = set(s, data); if (err) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { super_unlock_shared(sb); put_super(sb); } EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); enum super_iter_flags_t { SUPER_ITER_EXCL = (1U << 0), SUPER_ITER_UNLOCKED = (1U << 1), SUPER_ITER_REVERSE = (1U << 2), }; static inline struct super_block *first_super(enum super_iter_flags_t flags) { if (flags & SUPER_ITER_REVERSE) return list_last_entry(&super_blocks, struct super_block, s_list); return list_first_entry(&super_blocks, struct super_block, s_list); } static inline struct super_block *next_super(struct super_block *sb, enum super_iter_flags_t flags) { if (flags & SUPER_ITER_REVERSE) return list_prev_entry(sb, s_list); return list_next_entry(sb, s_list); } static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, enum super_iter_flags_t flags) { struct super_block *sb, *p = NULL; bool excl = flags & SUPER_ITER_EXCL; guard(spinlock)(&sb_lock); for (sb = first_super(flags); !list_entry_is_head(sb, &super_blocks, s_list); sb = next_super(sb, flags)) { if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); if (flags & SUPER_ITER_UNLOCKED) { f(sb, arg); } else if (super_lock(sb, excl)) { f(sb, arg); super_unlock(sb, excl); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); } void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { __iterate_supers(f, arg, 0); } /** * iterate_supers_type - call function for superblocks of given type * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { bool locked; if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_shared(sb); if (locked) { f(sb, arg); super_unlock_shared(sb); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } EXPORT_SYMBOL(iterate_supers_type); struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { bool locked; if (sb->s_dev != dev) continue; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock(sb, excl); if (locked) return sb; spin_lock(&sb_lock); __put_super(sb); break; } spin_unlock(&sb_lock); return NULL; } /** * reconfigure_super - asks filesystem to change superblock parameters * @fc: The superblock and configuration * * Alters the configuration parameters of a live superblock. */ int reconfigure_super(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; retval = security_sb_remount(sb, fc->security); if (retval) return retval; if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev)) return -EACCES; #endif remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { super_unlock_excl(sb); group_pin_kill(&sb->s_pins); __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); /* If we are reconfiguring to RDONLY and current sb is read/write, * make sure there are no files open for writing. */ if (remount_ro) { if (force) { sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } } else if (remount_rw) { /* * Protect filesystem's reconfigure code from writes from * userspace until reconfigure finishes. */ sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in * pagecache, etc). Also file data modifications go via their own * mappings. So If we try to mount readonly then copy the filesystem * from bdev, we could get stale data, so invalidate it to give a best * effort at coherency. */ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; cancel_readonly: sb_end_ro_state_change(sb); return retval; } static void do_emergency_remount_callback(struct super_block *sb, void *unused) { if (sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY | SB_FORCE, SB_RDONLY); if (!IS_ERR(fc)) { if (parse_monolithic_mount_data(fc, NULL) == 0) (void)reconfigure_super(fc); put_fs_context(fc); } } } static void do_emergency_remount(struct work_struct *work) { __iterate_supers(do_emergency_remount_callback, NULL, SUPER_ITER_EXCL | SUPER_ITER_REVERSE); kfree(work); printk("Emergency Remount complete\n"); } void emergency_remount(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_emergency_remount); schedule_work(work); } } static void do_thaw_all_callback(struct super_block *sb, void *unused) { if (IS_ENABLED(CONFIG_BLOCK)) while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE, NULL); return; } static void do_thaw_all(struct work_struct *work) { __iterate_supers(do_thaw_all_callback, NULL, SUPER_ITER_EXCL); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } /** * emergency_thaw_all -- forcibly thaw every frozen filesystem * * Used for emergency unfreeze of all filesystems via SysRq */ void emergency_thaw_all(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_thaw_all); schedule_work(work); } } static inline bool get_active_super(struct super_block *sb) { bool active = false; if (super_lock_excl(sb)) { active = atomic_inc_not_zero(&sb->s_active); super_unlock_excl(sb); } return active; } static const char *filesystems_freeze_ptr = "filesystems_freeze"; static void filesystems_freeze_callback(struct super_block *sb, void *unused) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; if (!get_active_super(sb)) return; if (sb->s_op->freeze_super) sb->s_op->freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); else freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); deactivate_super(sb); } void filesystems_freeze(void) { __iterate_supers(filesystems_freeze_callback, NULL, SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE); } static void filesystems_thaw_callback(struct super_block *sb, void *unused) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; if (!get_active_super(sb)) return; if (sb->s_op->thaw_super) sb->s_op->thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); else thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, filesystems_freeze_ptr); deactivate_super(sb); } void filesystems_thaw(void) { __iterate_supers(filesystems_thaw_callback, NULL, SUPER_ITER_UNLOCKED); } static DEFINE_IDA(unnamed_dev_ida); /** * get_anon_bdev - Allocate a block device for filesystems which don't have one. * @p: Pointer to a dev_t. * * Filesystems which don't use real block devices can call this function * to allocate a virtual block device. * * Context: Any context. Frequently called while holding sb_lock. * Return: 0 on success, -EMFILE if there are no anonymous bdevs left * or -ENOMEM if memory allocation failed. */ int get_anon_bdev(dev_t *p) { int dev; /* * Many userspace utilities consider an FSID of 0 invalid. * Always return at least 1 from get_anon_bdev. */ dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1, GFP_ATOMIC); if (dev == -ENOSPC) dev = -EMFILE; if (dev < 0) return dev; *p = MKDEV(0, dev); return 0; } EXPORT_SYMBOL(get_anon_bdev); void free_anon_bdev(dev_t dev) { ida_free(&unnamed_dev_ida, MINOR(dev)); } EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); void kill_litter_super(struct super_block *sb) { if (sb->s_root) d_genocide(sb->s_root); kill_anon_super(sb); } EXPORT_SYMBOL(kill_litter_super); int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); } EXPORT_SYMBOL(set_anon_super_fc); static int test_keyed_super(struct super_block *sb, struct fs_context *fc) { return sb->s_fs_info == fc->s_fs_info; } static int test_single_super(struct super_block *s, struct fs_context *fc) { return 1; } static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { struct super_block *sb; int err; sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = fill_super(sb, fc); if (err) goto error; sb->s_flags |= SB_ACTIVE; } fc->root = dget(sb->s_root); return 0; error: deactivate_locked_super(sb); return err; } int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); static int set_bdev_super(struct super_block *s, void *data) { s->s_dev = *(dev_t *)data; return 0; } static int super_s_dev_set(struct super_block *s, struct fs_context *fc) { return set_bdev_super(s, fc->sget_key); } static int super_s_dev_test(struct super_block *s, struct fs_context *fc) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)fc->sget_key; } /** * sget_dev - Find or create a superblock by device number * @fc: Filesystem context. * @dev: device number * * Find or create a superblock using the provided device number that * will be stored in fc->sget_key. * * If an extant superblock is matched, then that will be returned with * an elevated reference count that the caller must transfer or discard. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will * be set). The superblock will be published and it will be returned in * a partially constructed state with SB_BORN and SB_ACTIVE as yet * unset. * * Return: an existing or newly created superblock on success, an error * pointer on failure. */ struct super_block *sget_dev(struct fs_context *fc, dev_t dev) { fc->sget_key = &dev; return sget_fc(fc, super_s_dev_test, super_s_dev_set); } EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* * Lock the superblock that is holder of the bdev. Returns the superblock * pointer if we successfully locked the superblock and it is alive. Otherwise * we return NULL and just unlock bdev->bd_holder_lock. * * The function must be called with bdev->bd_holder_lock and releases it. */ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; bool locked; lockdep_assert_held(&bdev->bd_holder_lock); lockdep_assert_not_held(&sb->s_umount); lockdep_assert_not_held(&bdev->bd_disk->open_mutex); /* Make sure sb doesn't go away from under us */ spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_holder_lock); locked = super_lock(sb, excl); /* * If the superblock wasn't already SB_DYING then we hold * s_umount and can safely drop our temporary reference. */ put_super(sb); if (!locked) return NULL; if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock(sb, excl); return NULL; } return sb; } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); evict_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); super_unlock_shared(sb); } static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; sync_filesystem(sb); super_unlock_shared(sb); } static struct super_block *get_bdev_super(struct block_device *bdev) { bool active = false; struct super_block *sb; sb = bdev_super_lock(bdev, true); if (sb) { active = atomic_inc_not_zero(&sb->s_active); super_unlock_excl(sb); } if (!active) return NULL; return sb; } /** * fs_bdev_freeze - freeze owning filesystem of block device * @bdev: block device * * Freeze the filesystem that owns this block device if it is still * active. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ static int fs_bdev_freeze(struct block_device *bdev) { struct super_block *sb; int error = 0; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); else error = freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); if (!error) error = sync_blockdev(bdev); deactivate_super(sb); return error; } /** * fs_bdev_thaw - thaw owning filesystem of block device * @bdev: block device * * Thaw the filesystem that owns this block device. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the thaw was successful zero is returned. If the thaw * failed a negative error code is returned. If this function * returns zero it doesn't mean that the filesystem is unfrozen * as it may have been frozen multiple times (kernel may hold a * freeze or might be frozen from other block devices). */ static int fs_bdev_thaw(struct block_device *bdev) { struct super_block *sb; int error; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); /* * The block device may have been frozen before it was claimed by a * filesystem. Concurrently another process might try to mount that * frozen block device and has temporarily claimed the block device for * that purpose causing a concurrent fs_bdev_thaw() to end up here. The * mounter is already about to abort mounting because they still saw an * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return * NULL in that case. */ sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->thaw_super) error = sb->s_op->thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); else error = thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); deactivate_super(sb); return error; } const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, }; EXPORT_SYMBOL_GPL(fs_holder_ops); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); struct file *bdev_file; struct block_device *bdev; bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev_file); } bdev = file_bdev(bdev_file); /* * This really should be in blkdev_get_by_dev, but right now can't due * to legacy issues that require us to allow opening a block device node * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { bdev_fput(bdev_file); return -EACCES; } /* * It is enough to check bdev was not frozen before we set * s_bdev as freezing will wait until SB_BORN is set. */ if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); bdev_fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; } EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev_flags - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock * @flags: GET_TREE_BDEV_* flags */ int get_tree_bdev_flags(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), unsigned int flags) { struct super_block *s; int error = 0; dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); error = lookup_bdev(fc->source, &dev); if (error) { if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP)) errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); return -EBUSY; } } else { error = setup_bdev_super(s, fc->sb_flags, fc); if (!error) error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } s->s_flags |= SB_ACTIVE; } BUG_ON(fc->root); fc->root = dget(s->s_root); return 0; } EXPORT_SYMBOL_GPL(get_tree_bdev_flags); /** * get_tree_bdev - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock */ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { return get_tree_bdev_flags(fc, fill_super, 0); } EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; dev_t dev; error = lookup_bdev(dev_name, &dev); if (error) return ERR_PTR(error); flags |= SB_NOSEC; s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } } else { error = setup_bdev_super(s, flags, NULL); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; } return dget(s->s_root); } EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); bdev_fput(sb->s_bdev_file); } } EXPORT_SYMBOL(kill_block_super); #endif struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. * * The filesystem is invoked to get or create a superblock which can then later * be used for mounting. The filesystem places a pointer to the root to be * used for mounting in @fc->root. */ int vfs_get_tree(struct fs_context *fc) { struct super_block *sb; int error; if (fc->root) return -EBUSY; /* Get the mountable root in fc->root, with a ref on the root and a ref * on the superblock. */ error = fc->ops->get_tree(fc); if (error < 0) return error; if (!fc->root) { pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n", fc->fs_type->name, error); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ BUG(); } sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); /* * super_wake() contains a memory barrier which also care of * ordering for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is * the superblock structure contents that we just set up, not * the SB_BORN flag. */ super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { fc_drop_locked(fc); return error; } /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); return 0; } EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct backing_dev_info *bdi; int err; va_list args; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); return err; } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; sb->s_iflags |= SB_I_PERSB_BDI; return 0; } EXPORT_SYMBOL(super_setup_bdi_name); /* * Setup private BDI for given superblock. I gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi(struct super_block *sb) { static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, atomic_long_inc_return(&bdi_seq)); } EXPORT_SYMBOL(super_setup_bdi); /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file * system. */ static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); } /* * We are going to return to userspace and forget about these locks, the * ownership goes to the caller of thaw_super() which does unlock(). */ static void lockdep_sb_freeze_release(struct super_block *sb) { int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_); } /* * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). */ static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } static void sb_freeze_unlock(struct super_block *sb, int level) { for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } static int wait_for_partially_frozen(struct super_block *sb) { int ret = 0; do { unsigned short old = sb->s_writers.frozen; up_write(&sb->s_umount); ret = wait_var_event_killable(&sb->s_writers.frozen, sb->s_writers.frozen != old); down_write(&sb->s_umount); } while (ret == 0 && sb->s_writers.frozen != SB_UNFROZEN && sb->s_writers.frozen != SB_FREEZE_COMPLETE); return ret; } #define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) #define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST | FREEZE_EXCL) static inline int freeze_inc(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_HOLDER_KERNEL) ++sb->s_writers.freeze_kcount; if (who & FREEZE_HOLDER_USERSPACE) ++sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline int freeze_dec(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount) --sb->s_writers.freeze_kcount; if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount) --sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline bool may_freeze(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { lockdep_assert_held(&sb->s_umount); WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_EXCL) { if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(!freeze_owner)) return false; /* This freeze already has a specific owner. */ if (sb->s_writers.freeze_owner) return false; /* * This is already frozen multiple times so we're just * going to take a reference count and mark the freeze as * being owned by the caller. */ if (sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount) sb->s_writers.freeze_owner = freeze_owner; return true; } if (who & FREEZE_HOLDER_KERNEL) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_kcount == 0; if (who & FREEZE_HOLDER_USERSPACE) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_ucount == 0; return false; } static inline bool may_unfreeze(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { lockdep_assert_held(&sb->s_umount); WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_EXCL) { if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL))) return false; if (WARN_ON_ONCE(!freeze_owner)) return false; if (WARN_ON_ONCE(sb->s_writers.freeze_kcount == 0)) return false; /* This isn't exclusively frozen. */ if (!sb->s_writers.freeze_owner) return false; /* This isn't exclusively frozen by us. */ if (sb->s_writers.freeze_owner != freeze_owner) return false; /* * This is still frozen multiple times so we're just * going to drop our reference count and undo our * exclusive freeze. */ if ((sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount) > 1) sb->s_writers.freeze_owner = NULL; return true; } if (who & FREEZE_HOLDER_KERNEL) { /* * Someone's trying to steal the reference belonging to * @sb->s_writers.freeze_owner. */ if (sb->s_writers.freeze_kcount == 1 && sb->s_writers.freeze_owner) return false; return sb->s_writers.freeze_kcount > 0; } if (who & FREEZE_HOLDER_USERSPACE) return sb->s_writers.freeze_ucount > 0; return false; } /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * @who: context that wants to freeze * @freeze_owner: owner of the freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or * multiple userspace freezes in effect at any given time, the kernel and * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * * A filesystem may hold multiple devices and thus a filesystems may be * frozen through the block layer via multiple block devices. In this * case the request is marked as being allowed to nest by passing * FREEZE_MAY_NEST. The filesystem remains frozen until all block * devices are unfrozen. If multiple freezes are attempted without * FREEZE_MAY_NEST -EBUSY will be returned. * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. * * SB_FREEZE_WRITE: The file system is in the process of being frozen. New * writes should be blocked, though page faults are still allowed. We wait for * all writes to complete and then proceed to the next stage. * * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked * but internal fs threads can still modify the filesystem (although they * should not dirty new pages or inodes), writeback can run etc. After waiting * for all running page faults we sync the filesystem which will clean all * dirty pages and inodes (no new dirty pages or inodes can be created when * sync is running). * * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs * modification are blocked (e.g. XFS preallocation truncation on inode * reclaim). This is usually implemented by blocking new transactions for * filesystems that have them and need this additional guard. After all * internal writers are finished we call ->freeze_fs() to finish filesystem * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ int freeze_super(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { int ret; if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while freezing!"); return -EINVAL; } atomic_inc(&sb->s_active); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (may_freeze(sb, who, freeze_owner)) ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1); else ret = -EBUSY; /* All freezers share a single active reference. */ deactivate_locked_super(sb); return ret; } if (sb->s_writers.frozen != SB_UNFROZEN) { ret = wait_for_partially_frozen(sb); if (ret) { deactivate_locked_super(sb); return ret; } goto retry; } if (sb_rdonly(sb)) { /* Nothing to do really... */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.freeze_owner = freeze_owner; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); __super_lock_excl(sb); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ ret = sync_filesystem(sb); if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } } /* * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.freeze_owner = freeze_owner; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); /* * Undoes the effect of a freeze_super_locked call. If the filesystem is * frozen both by userspace and the kernel, a thaw call from either source * removes that state without releasing the other state or unlocking the * filesystem. */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { int error = -EINVAL; if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) goto out_unlock; if (!may_unfreeze(sb, who, freeze_owner)) goto out_unlock; /* * All freezers share a single active reference. * So just unlock in case there are any left. */ if (freeze_dec(sb, who)) goto out_unlock; if (sb_rdonly(sb)) { sb->s_writers.frozen = SB_UNFROZEN; sb->s_writers.freeze_owner = NULL; wake_up_var(&sb->s_writers.frozen); goto out_deactivate; } lockdep_sb_freeze_acquire(sb); if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { pr_err("VFS: Filesystem thaw failed\n"); freeze_inc(sb, who); lockdep_sb_freeze_release(sb); goto out_unlock; } } sb->s_writers.frozen = SB_UNFROZEN; sb->s_writers.freeze_owner = NULL; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out_deactivate: deactivate_locked_super(sb); return 0; out_unlock: super_unlock_excl(sb); return error; } /** * thaw_super -- unlock filesystem * @sb: the super to thaw * @who: context that wants to freeze * @freeze_owner: owner of the freeze * * Unlocks the filesystem and marks it writeable again after freeze_super() * if there are no remaining freezes on the filesystem. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed * * A filesystem may hold multiple devices and thus a filesystems may * have been frozen through the block layer via multiple block devices. * The filesystem remains frozen until all block devices are unfrozen. */ int thaw_super(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while thawing!"); return -EINVAL; } return thaw_super_locked(sb, who, freeze_owner); } EXPORT_SYMBOL(thaw_super); /* * Create workqueue for deferred direct IO completions. We allocate the * workqueue when it's first needed. This avoids creating workqueue for * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. */ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id); if (!wq) return -ENOMEM; /* * This has to be atomic as more DIOs can race to create the workqueue */ old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); /* Someone created workqueue before us? Free ours... */ if (old) destroy_workqueue(wq); return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
26 19 26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 // SPDX-License-Identifier: GPL-2.0-or-later /* * geniv: Shared IV generator code * * This file provides common code to IV generators such as seqiv. * * Copyright (c) 2007-2019 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/geniv.h> #include <crypto/internal/rng.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/slab.h> static int aead_geniv_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int keylen) { struct aead_geniv_ctx *ctx = crypto_aead_ctx(tfm); return crypto_aead_setkey(ctx->child, key, keylen); } static int aead_geniv_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { struct aead_geniv_ctx *ctx = crypto_aead_ctx(tfm); return crypto_aead_setauthsize(ctx->child, authsize); } static void aead_geniv_free(struct aead_instance *inst) { crypto_drop_aead(aead_instance_ctx(inst)); kfree(inst); } struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_aead_spawn *spawn; struct aead_instance *inst; struct aead_alg *alg; unsigned int ivsize; unsigned int maxauthsize; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return ERR_PTR(err); inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return ERR_PTR(-ENOMEM); spawn = aead_instance_ctx(inst); err = crypto_grab_aead(spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(spawn); ivsize = crypto_aead_alg_ivsize(alg); maxauthsize = crypto_aead_alg_maxauthsize(alg); err = -EINVAL; if (ivsize < sizeof(u64)) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", tmpl->name, alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", tmpl->name, alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = alg->base.cra_blocksize; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.base.cra_ctxsize = sizeof(struct aead_geniv_ctx); inst->alg.setkey = aead_geniv_setkey; inst->alg.setauthsize = aead_geniv_setauthsize; inst->alg.ivsize = ivsize; inst->alg.maxauthsize = maxauthsize; inst->free = aead_geniv_free; out: return inst; err_free_inst: aead_geniv_free(inst); inst = ERR_PTR(err); goto out; } EXPORT_SYMBOL_GPL(aead_geniv_alloc); int aead_init_geniv(struct crypto_aead *aead) { struct aead_geniv_ctx *ctx = crypto_aead_ctx(aead); struct aead_instance *inst = aead_alg_instance(aead); struct crypto_aead *child; int err; spin_lock_init(&ctx->lock); err = crypto_get_default_rng(); if (err) goto out; err = crypto_rng_get_bytes(crypto_default_rng, ctx->salt, crypto_aead_ivsize(aead)); crypto_put_default_rng(); if (err) goto out; child = crypto_spawn_aead(aead_instance_ctx(inst)); err = PTR_ERR(child); if (IS_ERR(child)) goto out; ctx->child = child; crypto_aead_set_reqsize(aead, crypto_aead_reqsize(child) + sizeof(struct aead_request)); err = 0; out: return err; } EXPORT_SYMBOL_GPL(aead_init_geniv); void aead_exit_geniv(struct crypto_aead *tfm) { struct aead_geniv_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } EXPORT_SYMBOL_GPL(aead_exit_geniv); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Shared IV generator code");
4 3 3 3 3 7 7 7 4 4 4 2 4 1 3 2 3 15 5 10 10 5 5 4 3 1 12 12 12 12 12 4 4 4 12 1 1 12 12 12 12 9 8 8 1 1 12 9 9 9 9 12 12 3 3 26 26 26 9 25 12 12 12 12 12 12 12 12 3 12 12 1 11 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 9 9 10 10 7 9 2 2 2 12 12 6 10 12 12 11 1 2 3 3 6 7 17 9 17 17 2 1 1 2 2 2 2 2 2 2 8 7 3 1 2 1 1 1 1 1 16 16 16 16 72 72 73 60 8 8 3 3 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Copyright (C) 2011 ProFUSION Embedded Systems Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth HCI core. */ #include <linux/export.h> #include <linux/rfkill.h> #include <linux/debugfs.h> #include <linux/crypto.h> #include <linux/kcov.h> #include <linux/property.h> #include <linux/suspend.h> #include <linux/wait.h> #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/mgmt.h> #include "hci_debugfs.h" #include "smp.h" #include "leds.h" #include "msft.h" #include "aosp.h" #include "hci_codec.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); static void hci_tx_work(struct work_struct *work); /* HCI device list */ LIST_HEAD(hci_dev_list); DEFINE_RWLOCK(hci_dev_list_lock); /* HCI callback list */ LIST_HEAD(hci_cb_list); DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); /* Get HCI device by index. * Device is held on return. */ struct hci_dev *hci_dev_get(int index) { struct hci_dev *hdev = NULL, *d; BT_DBG("%d", index); if (index < 0) return NULL; read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (d->id == index) { hdev = hci_dev_hold(d); break; } } read_unlock(&hci_dev_list_lock); return hdev; } /* ---- Inquiry support ---- */ bool hci_discovery_active(struct hci_dev *hdev) { struct discovery_state *discov = &hdev->discovery; switch (discov->state) { case DISCOVERY_FINDING: case DISCOVERY_RESOLVING: return true; default: return false; } } void hci_discovery_set_state(struct hci_dev *hdev, int state) { int old_state = hdev->discovery.state; if (old_state == state) return; hdev->discovery.state = state; switch (state) { case DISCOVERY_STOPPED: hci_update_passive_scan(hdev); if (old_state != DISCOVERY_STARTING) mgmt_discovering(hdev, 0); break; case DISCOVERY_STARTING: break; case DISCOVERY_FINDING: mgmt_discovering(hdev, 1); break; case DISCOVERY_RESOLVING: break; case DISCOVERY_STOPPING: break; } bt_dev_dbg(hdev, "state %u -> %u", old_state, state); } void hci_inquiry_cache_flush(struct hci_dev *hdev) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *p, *n; list_for_each_entry_safe(p, n, &cache->all, all) { list_del(&p->all); kfree(p); } INIT_LIST_HEAD(&cache->unknown); INIT_LIST_HEAD(&cache->resolve); } struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p, %pMR", cache, bdaddr); list_for_each_entry(e, &cache->all, all) { if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } struct inquiry_entry *hci_inquiry_cache_lookup_unknown(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p, %pMR", cache, bdaddr); list_for_each_entry(e, &cache->unknown, list) { if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } struct inquiry_entry *hci_inquiry_cache_lookup_resolve(struct hci_dev *hdev, bdaddr_t *bdaddr, int state) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p bdaddr %pMR state %d", cache, bdaddr, state); list_for_each_entry(e, &cache->resolve, list) { if (!bacmp(bdaddr, BDADDR_ANY) && e->name_state == state) return e; if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } void hci_inquiry_cache_update_resolve(struct hci_dev *hdev, struct inquiry_entry *ie) { struct discovery_state *cache = &hdev->discovery; struct list_head *pos = &cache->resolve; struct inquiry_entry *p; list_del(&ie->list); list_for_each_entry(p, &cache->resolve, list) { if (p->name_state != NAME_PENDING && abs(p->data.rssi) >= abs(ie->data.rssi)) break; pos = &p->list; } list_add(&ie->list, pos); } u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, bool name_known) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *ie; u32 flags = 0; BT_DBG("cache %p, %pMR", cache, &data->bdaddr); hci_remove_remote_oob_data(hdev, &data->bdaddr, BDADDR_BREDR); if (!data->ssp_mode) flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; ie = hci_inquiry_cache_lookup(hdev, &data->bdaddr); if (ie) { if (!ie->data.ssp_mode) flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; if (ie->name_state == NAME_NEEDED && data->rssi != ie->data.rssi) { ie->data.rssi = data->rssi; hci_inquiry_cache_update_resolve(hdev, ie); } goto update; } /* Entry not in the cache. Add new one. */ ie = kzalloc(sizeof(*ie), GFP_KERNEL); if (!ie) { flags |= MGMT_DEV_FOUND_CONFIRM_NAME; goto done; } list_add(&ie->all, &cache->all); if (name_known) { ie->name_state = NAME_KNOWN; } else { ie->name_state = NAME_NOT_KNOWN; list_add(&ie->list, &cache->unknown); } update: if (name_known && ie->name_state != NAME_KNOWN && ie->name_state != NAME_PENDING) { ie->name_state = NAME_KNOWN; list_del(&ie->list); } memcpy(&ie->data, data, sizeof(*data)); ie->timestamp = jiffies; cache->timestamp = jiffies; if (ie->name_state == NAME_NOT_KNOWN) flags |= MGMT_DEV_FOUND_CONFIRM_NAME; done: return flags; } static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf) { struct discovery_state *cache = &hdev->discovery; struct inquiry_info *info = (struct inquiry_info *) buf; struct inquiry_entry *e; int copied = 0; list_for_each_entry(e, &cache->all, all) { struct inquiry_data *data = &e->data; if (copied >= num) break; bacpy(&info->bdaddr, &data->bdaddr); info->pscan_rep_mode = data->pscan_rep_mode; info->pscan_period_mode = data->pscan_period_mode; info->pscan_mode = data->pscan_mode; memcpy(info->dev_class, data->dev_class, 3); info->clock_offset = data->clock_offset; info++; copied++; } BT_DBG("cache %p, copied %d", cache, copied); return copied; } int hci_inquiry(void __user *arg) { __u8 __user *ptr = arg; struct hci_inquiry_req ir; struct hci_dev *hdev; int err = 0, do_inquiry = 0, max_rsp; __u8 *buf; if (copy_from_user(&ir, ptr, sizeof(ir))) return -EFAULT; hdev = hci_dev_get(ir.dev_id); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } /* Restrict maximum inquiry length to 60 seconds */ if (ir.length > 60) { err = -EINVAL; goto done; } hci_dev_lock(hdev); if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) { hci_inquiry_cache_flush(hdev); do_inquiry = 1; } hci_dev_unlock(hdev); if (do_inquiry) { hci_req_sync_lock(hdev); err = hci_inquiry_sync(hdev, ir.length, ir.num_rsp); hci_req_sync_unlock(hdev); if (err < 0) goto done; /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is * cleared). If it is interrupted by a signal, return -EINTR. */ if (wait_on_bit(&hdev->flags, HCI_INQUIRY, TASK_INTERRUPTIBLE)) { err = -EINTR; goto done; } } /* for unlimited number of responses we will use buffer with * 255 entries */ max_rsp = (ir.num_rsp == 0) ? 255 : ir.num_rsp; /* cache_dump can't sleep. Therefore we allocate temp buffer and then * copy it to the user space. */ buf = kmalloc_array(max_rsp, sizeof(struct inquiry_info), GFP_KERNEL); if (!buf) { err = -ENOMEM; goto done; } hci_dev_lock(hdev); ir.num_rsp = inquiry_cache_dump(hdev, max_rsp, buf); hci_dev_unlock(hdev); BT_DBG("num_rsp %d", ir.num_rsp); if (!copy_to_user(ptr, &ir, sizeof(ir))) { ptr += sizeof(ir); if (copy_to_user(ptr, buf, sizeof(struct inquiry_info) * ir.num_rsp)) err = -EFAULT; } else err = -EFAULT; kfree(buf); done: hci_dev_put(hdev); return err; } static int hci_dev_do_open(struct hci_dev *hdev) { int ret = 0; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); ret = hci_dev_open_sync(hdev); hci_req_sync_unlock(hdev); return ret; } /* ---- HCI ioctl helpers ---- */ int hci_dev_open(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; /* Devices that are marked as unconfigured can only be powered * up as user channel. Trying to bring them up as normal devices * will result into a failure. Only user channel operation is * possible. * * When this function is called for a user channel, the flag * HCI_USER_CHANNEL will be set first before attempting to * open the device. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EOPNOTSUPP; goto done; } /* We need to ensure that no other power on/off work is pending * before proceeding to call hci_dev_do_open. This is * particularly important if the setup procedure has not yet * completed. */ if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); /* After this call it is guaranteed that the setup procedure * has finished. This means that error conditions like RFKILL * or no valid public or static random address apply. */ flush_workqueue(hdev->req_workqueue); /* For controllers not using the management interface and that * are brought up using legacy ioctl, set the HCI_BONDABLE bit * so that pairing works for them. Once the management interface * is in use this bit will be cleared again and userspace has * to explicitly enable it. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !hci_dev_test_flag(hdev, HCI_MGMT)) hci_dev_set_flag(hdev, HCI_BONDABLE); err = hci_dev_do_open(hdev); done: hci_dev_put(hdev); return err; } int hci_dev_do_close(struct hci_dev *hdev) { int err; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); err = hci_dev_close_sync(hdev); hci_req_sync_unlock(hdev); return err; } int hci_dev_close(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } cancel_work_sync(&hdev->power_on); if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); err = hci_dev_do_close(hdev); done: hci_dev_put(hdev); return err; } static int hci_dev_do_reset(struct hci_dev *hdev) { int ret; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); /* Drop queues */ skb_queue_purge(&hdev->rx_q); skb_queue_purge(&hdev->cmd_q); /* Cancel these to avoid queueing non-chained pending work */ hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); /* Wait for * * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) * queue_delayed_work(&hdev->{cmd,ncmd}_timer) * * inside RCU section to see the flag or complete scheduling. */ synchronize_rcu(); /* Explicitly cancel works in case scheduled after setting the flag. */ cancel_delayed_work(&hdev->cmd_timer); cancel_delayed_work(&hdev->ncmd_timer); /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ drain_workqueue(hdev->workqueue); hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); hci_conn_hash_flush(hdev); hci_dev_unlock(hdev); if (hdev->flush) hdev->flush(hdev); hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); atomic_set(&hdev->cmd_cnt, 1); hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0; hdev->iso_cnt = 0; ret = hci_reset_sync(hdev); hci_req_sync_unlock(hdev); return ret; } int hci_dev_reset(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (!test_bit(HCI_UP, &hdev->flags)) { err = -ENETDOWN; goto done; } if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } err = hci_dev_do_reset(hdev); done: hci_dev_put(hdev); return err; } int hci_dev_reset_stat(__u16 dev) { struct hci_dev *hdev; int ret = 0; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { ret = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { ret = -EOPNOTSUPP; goto done; } memset(&hdev->stat, 0, sizeof(struct hci_dev_stats)); done: hci_dev_put(hdev); return ret; } static void hci_update_passive_scan_state(struct hci_dev *hdev, u8 scan) { bool conn_changed, discov_changed; BT_DBG("%s scan 0x%02x", hdev->name, scan); if ((scan & SCAN_PAGE)) conn_changed = !hci_dev_test_and_set_flag(hdev, HCI_CONNECTABLE); else conn_changed = hci_dev_test_and_clear_flag(hdev, HCI_CONNECTABLE); if ((scan & SCAN_INQUIRY)) { discov_changed = !hci_dev_test_and_set_flag(hdev, HCI_DISCOVERABLE); } else { hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); discov_changed = hci_dev_test_and_clear_flag(hdev, HCI_DISCOVERABLE); } if (!hci_dev_test_flag(hdev, HCI_MGMT)) return; if (conn_changed || discov_changed) { /* In case this was disabled through mgmt */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) hci_update_adv_data(hdev, hdev->cur_adv_instance); mgmt_new_settings(hdev); } } int hci_dev_cmd(unsigned int cmd, void __user *arg) { struct hci_dev *hdev; struct hci_dev_req dr; __le16 policy; int err = 0; if (copy_from_user(&dr, arg, sizeof(dr))) return -EFAULT; hdev = hci_dev_get(dr.dev_id); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } switch (cmd) { case HCISETAUTH: err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETENCRYPT: if (!lmp_encrypt_capable(hdev)) { err = -EOPNOTSUPP; break; } if (!test_bit(HCI_AUTH, &hdev->flags)) { /* Auth must be enabled first */ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); if (err) break; } err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_ENCRYPT_MODE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETSCAN: err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &dr.dev_opt, HCI_CMD_TIMEOUT); /* Ensure that the connectable and discoverable states * get correctly modified as this was a non-mgmt change. */ if (!err) hci_update_passive_scan_state(hdev, dr.dev_opt); break; case HCISETLINKPOL: policy = cpu_to_le16(dr.dev_opt); err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy, HCI_CMD_TIMEOUT); break; case HCISETLINKMODE: hdev->link_mode = ((__u16) dr.dev_opt) & (HCI_LM_MASTER | HCI_LM_ACCEPT); break; case HCISETPTYPE: if (hdev->pkt_type == (__u16) dr.dev_opt) break; hdev->pkt_type = (__u16) dr.dev_opt; mgmt_phy_configuration_changed(hdev, NULL); break; case HCISETACLMTU: hdev->acl_mtu = *((__u16 *) &dr.dev_opt + 1); hdev->acl_pkts = *((__u16 *) &dr.dev_opt + 0); break; case HCISETSCOMTU: hdev->sco_mtu = *((__u16 *) &dr.dev_opt + 1); hdev->sco_pkts = *((__u16 *) &dr.dev_opt + 0); break; default: err = -EINVAL; break; } done: hci_dev_put(hdev); return err; } int hci_get_dev_list(void __user *arg) { struct hci_dev *hdev; struct hci_dev_list_req *dl; struct hci_dev_req *dr; int n = 0, err; __u16 dev_num; if (get_user(dev_num, (__u16 __user *) arg)) return -EFAULT; if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr)) return -EINVAL; dl = kzalloc(struct_size(dl, dev_req, dev_num), GFP_KERNEL); if (!dl) return -ENOMEM; dl->dev_num = dev_num; dr = dl->dev_req; read_lock(&hci_dev_list_lock); list_for_each_entry(hdev, &hci_dev_list, list) { unsigned long flags = hdev->flags; /* When the auto-off is configured it means the transport * is running, but in that case still indicate that the * device is actually down. */ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags &= ~BIT(HCI_UP); dr[n].dev_id = hdev->id; dr[n].dev_opt = flags; if (++n >= dev_num) break; } read_unlock(&hci_dev_list_lock); dl->dev_num = n; err = copy_to_user(arg, dl, struct_size(dl, dev_req, n)); kfree(dl); return err ? -EFAULT : 0; } int hci_get_dev_info(void __user *arg) { struct hci_dev *hdev; struct hci_dev_info di; unsigned long flags; int err = 0; if (copy_from_user(&di, arg, sizeof(di))) return -EFAULT; hdev = hci_dev_get(di.dev_id); if (!hdev) return -ENODEV; /* When the auto-off is configured it means the transport * is running, but in that case still indicate that the * device is actually down. */ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags = hdev->flags & ~BIT(HCI_UP); else flags = hdev->flags; strscpy(di.name, hdev->name, sizeof(di.name)); di.bdaddr = hdev->bdaddr; di.type = (hdev->bus & 0x0f); di.flags = flags; di.pkt_type = hdev->pkt_type; if (lmp_bredr_capable(hdev)) { di.acl_mtu = hdev->acl_mtu; di.acl_pkts = hdev->acl_pkts; di.sco_mtu = hdev->sco_mtu; di.sco_pkts = hdev->sco_pkts; } else { di.acl_mtu = hdev->le_mtu; di.acl_pkts = hdev->le_pkts; di.sco_mtu = 0; di.sco_pkts = 0; } di.link_policy = hdev->link_policy; di.link_mode = hdev->link_mode; memcpy(&di.stat, &hdev->stat, sizeof(di.stat)); memcpy(&di.features, &hdev->features, sizeof(di.features)); if (copy_to_user(arg, &di, sizeof(di))) err = -EFAULT; hci_dev_put(hdev); return err; } /* ---- Interface to HCI drivers ---- */ static int hci_dev_do_poweroff(struct hci_dev *hdev) { int err; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); err = hci_set_powered_sync(hdev, false); hci_req_sync_unlock(hdev); return err; } static int hci_rfkill_set_block(void *data, bool blocked) { struct hci_dev *hdev = data; int err; BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; if (blocked == hci_dev_test_flag(hdev, HCI_RFKILLED)) return 0; if (blocked) { hci_dev_set_flag(hdev, HCI_RFKILLED); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { err = hci_dev_do_poweroff(hdev); if (err) { bt_dev_err(hdev, "Error when powering off device on rfkill (%d)", err); /* Make sure the device is still closed even if * anything during power off sequence (eg. * disconnecting devices) failed. */ hci_dev_do_close(hdev); } } } else { hci_dev_clear_flag(hdev, HCI_RFKILLED); } return 0; } static const struct rfkill_ops hci_rfkill_ops = { .set_block = hci_rfkill_set_block, }; static void hci_power_on(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, power_on); int err; BT_DBG("%s", hdev->name); if (test_bit(HCI_UP, &hdev->flags) && hci_dev_test_flag(hdev, HCI_MGMT) && hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { cancel_delayed_work(&hdev->power_off); err = hci_powered_update_sync(hdev); mgmt_power_on(hdev, err); return; } err = hci_dev_do_open(hdev); if (err < 0) { hci_dev_lock(hdev); mgmt_set_powered_failed(hdev, err); hci_dev_unlock(hdev); return; } /* During the HCI setup phase, a few error conditions are * ignored and they need to be checked now. If they are still * valid, it is important to turn the device back off. */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_do_close(hdev); } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) { queue_delayed_work(hdev->req_workqueue, &hdev->power_off, HCI_AUTO_OFF_TIMEOUT); } if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) { /* For unconfigured devices, set the HCI_RAW flag * so that userspace can easily identify them. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) set_bit(HCI_RAW, &hdev->flags); /* For fully configured devices, this will send * the Index Added event. For unconfigured devices, * it will send Unconfigued Index Added event. * * Devices with HCI_QUIRK_RAW_DEVICE are ignored * and no event will be send. */ mgmt_index_added(hdev); } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) { /* When the controller is now configured, then it * is important to clear the HCI_RAW flag. */ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) clear_bit(HCI_RAW, &hdev->flags); /* Powering on the controller with HCI_CONFIG set only * happens with the transition from unconfigured to * configured. This will send the Index Added event. */ mgmt_index_added(hdev); } } static void hci_power_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, power_off.work); BT_DBG("%s", hdev->name); hci_dev_do_close(hdev); } static void hci_error_reset(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset); hci_dev_hold(hdev); BT_DBG("%s", hdev->name); if (hdev->hw_error) hdev->hw_error(hdev, hdev->hw_error_code); else bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); if (!hci_dev_do_close(hdev)) hci_dev_do_open(hdev); hci_dev_put(hdev); } void hci_uuids_clear(struct hci_dev *hdev) { struct bt_uuid *uuid, *tmp; list_for_each_entry_safe(uuid, tmp, &hdev->uuids, list) { list_del(&uuid->list); kfree(uuid); } } void hci_link_keys_clear(struct hci_dev *hdev) { struct link_key *key, *tmp; list_for_each_entry_safe(key, tmp, &hdev->link_keys, list) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } } void hci_smp_ltks_clear(struct hci_dev *hdev) { struct smp_ltk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } } void hci_smp_irks_clear(struct hci_dev *hdev) { struct smp_irk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } } void hci_blocked_keys_clear(struct hci_dev *hdev) { struct blocked_key *b, *tmp; list_for_each_entry_safe(b, tmp, &hdev->blocked_keys, list) { list_del_rcu(&b->list); kfree_rcu(b, rcu); } } bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]) { bool blocked = false; struct blocked_key *b; rcu_read_lock(); list_for_each_entry_rcu(b, &hdev->blocked_keys, list) { if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) { blocked = true; break; } } rcu_read_unlock(); return blocked; } struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *k; rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->link_keys, list) { if (bacmp(bdaddr, &k->bdaddr) == 0) { rcu_read_unlock(); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LINKKEY, k->val)) { bt_dev_warn_ratelimited(hdev, "Link key blocked for %pMR", &k->bdaddr); return NULL; } return k; } } rcu_read_unlock(); return NULL; } static bool hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn, u8 key_type, u8 old_key_type) { /* Legacy key */ if (key_type < 0x03) return true; /* Debug keys are insecure so don't store them persistently */ if (key_type == HCI_LK_DEBUG_COMBINATION) return false; /* Changed combination key and there's no previous one */ if (key_type == HCI_LK_CHANGED_COMBINATION && old_key_type == 0xff) return false; /* Security mode 3 case */ if (!conn) return true; /* BR/EDR key derived using SC from an LE link */ if (conn->type == LE_LINK) return true; /* Neither local nor remote side had no-bonding as requirement */ if (conn->auth_type > 0x01 && conn->remote_auth > 0x01) return true; /* Local side had dedicated bonding as requirement */ if (conn->auth_type == 0x02 || conn->auth_type == 0x03) return true; /* Remote side had dedicated bonding as requirement */ if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03) return true; /* If none of the above criteria match, then don't store the key * persistently */ return false; } static u8 ltk_role(u8 type) { if (type == SMP_LTK) return HCI_ROLE_MASTER; return HCI_ROLE_SLAVE; } struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 role) { struct smp_ltk *k; rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { if (addr_type != k->bdaddr_type || bacmp(bdaddr, &k->bdaddr)) continue; if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) { rcu_read_unlock(); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK, k->val)) { bt_dev_warn_ratelimited(hdev, "LTK blocked for %pMR", &k->bdaddr); return NULL; } return k; } } rcu_read_unlock(); return NULL; } struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) { struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (!bacmp(&irk->rpa, rpa)) { irk_to_return = irk; goto done; } } list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (smp_irk_matches(hdev, irk->val, rpa)) { bacpy(&irk->rpa, rpa); irk_to_return = irk; goto done; } } done: if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", &irk_to_return->bdaddr); irk_to_return = NULL; } rcu_read_unlock(); return irk_to_return; } struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; /* Identity Address must be public or static random */ if (addr_type == ADDR_LE_DEV_RANDOM && (bdaddr->b[5] & 0xc0) != 0xc0) return NULL; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (addr_type == irk->addr_type && bacmp(bdaddr, &irk->bdaddr) == 0) { irk_to_return = irk; goto done; } } done: if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", &irk_to_return->bdaddr); irk_to_return = NULL; } rcu_read_unlock(); return irk_to_return; } struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len, bool *persistent) { struct link_key *key, *old_key; u8 old_key_type; old_key = hci_find_link_key(hdev, bdaddr); if (old_key) { old_key_type = old_key->type; key = old_key; } else { old_key_type = conn ? conn->key_type : 0xff; key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; list_add_rcu(&key->list, &hdev->link_keys); } BT_DBG("%s key for %pMR type %u", hdev->name, bdaddr, type); /* Some buggy controller combinations generate a changed * combination key for legacy pairing even when there's no * previous key */ if (type == HCI_LK_CHANGED_COMBINATION && (!conn || conn->remote_auth == 0xff) && old_key_type == 0xff) { type = HCI_LK_COMBINATION; if (conn) conn->key_type = type; } bacpy(&key->bdaddr, bdaddr); memcpy(key->val, val, HCI_LINK_KEY_SIZE); key->pin_len = pin_len; if (type == HCI_LK_CHANGED_COMBINATION) key->type = old_key_type; else key->type = type; if (persistent) *persistent = hci_persistent_key(hdev, conn, type, old_key_type); return key; } struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 type, u8 authenticated, u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand) { struct smp_ltk *key, *old_key; u8 role = ltk_role(type); old_key = hci_find_ltk(hdev, bdaddr, addr_type, role); if (old_key) key = old_key; else { key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; list_add_rcu(&key->list, &hdev->long_term_keys); } bacpy(&key->bdaddr, bdaddr); key->bdaddr_type = addr_type; memcpy(key->val, tk, sizeof(key->val)); key->authenticated = authenticated; key->ediv = ediv; key->rand = rand; key->enc_size = enc_size; key->type = type; return key; } struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 val[16], bdaddr_t *rpa) { struct smp_irk *irk; irk = hci_find_irk_by_addr(hdev, bdaddr, addr_type); if (!irk) { irk = kzalloc(sizeof(*irk), GFP_KERNEL); if (!irk) return NULL; bacpy(&irk->bdaddr, bdaddr); irk->addr_type = addr_type; list_add_rcu(&irk->list, &hdev->identity_resolving_keys); } memcpy(irk->val, val, 16); bacpy(&irk->rpa, rpa); return irk; } int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *key; key = hci_find_link_key(hdev, bdaddr); if (!key) return -ENOENT; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&key->list); kfree_rcu(key, rcu); return 0; } int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct smp_ltk *k, *tmp; int removed = 0; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->bdaddr_type != bdaddr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&k->list); kfree_rcu(k, rcu); removed++; } return removed ? 0 : -ENOENT; } void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct smp_irk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->addr_type != addr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&k->list); kfree_rcu(k, rcu); } } bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) { struct smp_ltk *k; struct smp_irk *irk; u8 addr_type; if (type == BDADDR_BREDR) { if (hci_find_link_key(hdev, bdaddr)) return true; return false; } /* Convert to HCI addr type which struct smp_ltk uses */ if (type == BDADDR_LE_PUBLIC) addr_type = ADDR_LE_DEV_PUBLIC; else addr_type = ADDR_LE_DEV_RANDOM; irk = hci_get_irk(hdev, bdaddr, addr_type); if (irk) { bdaddr = &irk->bdaddr; addr_type = irk->addr_type; } rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { if (k->bdaddr_type == addr_type && !bacmp(bdaddr, &k->bdaddr)) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } /* HCI command timer function */ static void hci_cmd_timeout(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_timer.work); if (hdev->req_skb) { u16 opcode = hci_skb_opcode(hdev->req_skb); bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode); hci_cmd_sync_cancel_sync(hdev, ETIMEDOUT); } else { bt_dev_err(hdev, "command tx timeout"); } if (hdev->reset) hdev->reset(hdev); atomic_set(&hdev->cmd_cnt, 1); queue_work(hdev->workqueue, &hdev->cmd_work); } /* HCI ncmd timer function */ static void hci_ncmd_timeout(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, ncmd_timer.work); bt_dev_err(hdev, "Controller not accepting commands anymore: ncmd = 0"); /* During HCI_INIT phase no events can be injected if the ncmd timer * triggers since the procedure has its own timeout handling. */ if (test_bit(HCI_INIT, &hdev->flags)) return; /* This is an irrecoverable state, inject hardware error event */ hci_reset_dev(hdev); } struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct oob_data *data; list_for_each_entry(data, &hdev->remote_oob_data, list) { if (bacmp(bdaddr, &data->bdaddr) != 0) continue; if (data->bdaddr_type != bdaddr_type) continue; return data; } return NULL; } int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) return -ENOENT; BT_DBG("%s removing %pMR (%u)", hdev->name, bdaddr, bdaddr_type); list_del(&data->list); kfree(data); return 0; } void hci_remote_oob_data_clear(struct hci_dev *hdev) { struct oob_data *data, *n; list_for_each_entry_safe(data, n, &hdev->remote_oob_data, list) { list_del(&data->list); kfree(data); } } int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type, u8 *hash192, u8 *rand192, u8 *hash256, u8 *rand256) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) { data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; bacpy(&data->bdaddr, bdaddr); data->bdaddr_type = bdaddr_type; list_add(&data->list, &hdev->remote_oob_data); } if (hash192 && rand192) { memcpy(data->hash192, hash192, sizeof(data->hash192)); memcpy(data->rand192, rand192, sizeof(data->rand192)); if (hash256 && rand256) data->present = 0x03; } else { memset(data->hash192, 0, sizeof(data->hash192)); memset(data->rand192, 0, sizeof(data->rand192)); if (hash256 && rand256) data->present = 0x02; else data->present = 0x00; } if (hash256 && rand256) { memcpy(data->hash256, hash256, sizeof(data->hash256)); memcpy(data->rand256, rand256, sizeof(data->rand256)); } else { memset(data->hash256, 0, sizeof(data->hash256)); memset(data->rand256, 0, sizeof(data->rand256)); if (hash192 && rand192) data->present = 0x01; } BT_DBG("%s for %pMR", hdev->name, bdaddr); return 0; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; list_for_each_entry(adv_instance, &hdev->adv_instances, list) { if (adv_instance->instance == instance) return adv_instance; } return NULL; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid) { struct adv_info *adv; list_for_each_entry(adv, &hdev->adv_instances, list) { if (adv->sid == sid) return adv; } return NULL; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *cur_instance; cur_instance = hci_find_adv_instance(hdev, instance); if (!cur_instance) return NULL; if (cur_instance == list_last_entry(&hdev->adv_instances, struct adv_info, list)) return list_first_entry(&hdev->adv_instances, struct adv_info, list); else return list_next_entry(cur_instance, list); } /* This function requires the caller holds hdev->lock */ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) return -ENOENT; BT_DBG("%s removing %dMR", hdev->name, instance); if (hdev->cur_adv_instance == instance) { if (hdev->adv_instance_timeout) { cancel_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } hdev->cur_adv_instance = 0x00; } cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); hdev->adv_instance_cnt--; return 0; } void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired) { struct adv_info *adv_instance, *n; list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) adv_instance->rpa_expired = rpa_expired; } /* This function requires the caller holds hdev->lock */ void hci_adv_instances_clear(struct hci_dev *hdev) { struct adv_info *adv_instance, *n; if (hdev->adv_instance_timeout) { disable_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { disable_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); } hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; } static void adv_instance_rpa_expired(struct work_struct *work) { struct adv_info *adv_instance = container_of(work, struct adv_info, rpa_expired_cb.work); BT_DBG(""); adv_instance->rpa_expired = true; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration, s8 tx_power, u32 min_interval, u32 max_interval, u8 mesh_handle) { struct adv_info *adv; adv = hci_find_adv_instance(hdev, instance); if (adv) { memset(adv->adv_data, 0, sizeof(adv->adv_data)); memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); memset(adv->per_adv_data, 0, sizeof(adv->per_adv_data)); } else { if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets || instance < 1 || instance > hdev->le_num_of_adv_sets + 1) return ERR_PTR(-EOVERFLOW); adv = kzalloc(sizeof(*adv), GFP_KERNEL); if (!adv) return ERR_PTR(-ENOMEM); adv->pending = true; adv->instance = instance; /* If controller support only one set and the instance is set to * 1 then there is no option other than using handle 0x00. */ if (hdev->le_num_of_adv_sets == 1 && instance == 1) adv->handle = 0x00; else adv->handle = instance; list_add(&adv->list, &hdev->adv_instances); hdev->adv_instance_cnt++; } adv->flags = flags; adv->min_interval = min_interval; adv->max_interval = max_interval; adv->tx_power = tx_power; /* Defining a mesh_handle changes the timing units to ms, * rather than seconds, and ties the instance to the requested * mesh_tx queue. */ adv->mesh = mesh_handle; hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data, scan_rsp_len, scan_rsp_data); adv->timeout = timeout; adv->remaining_time = timeout; if (duration == 0) adv->duration = hdev->def_multi_adv_rotation_duration; else adv->duration = duration; INIT_DELAYED_WORK(&adv->rpa_expired_cb, adv_instance_rpa_expired); BT_DBG("%s for %dMR", hdev->name, instance); return adv; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval) { struct adv_info *adv; adv = hci_add_adv_instance(hdev, instance, flags, 0, NULL, 0, NULL, 0, 0, HCI_ADV_TX_POWER_NO_PREFERENCE, min_interval, max_interval, 0); if (IS_ERR(adv)) return adv; adv->sid = sid; adv->periodic = true; adv->per_adv_data_len = data_len; if (data) memcpy(adv->per_adv_data, data, data_len); return adv; } /* This function requires the caller holds hdev->lock */ int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data) { struct adv_info *adv; adv = hci_find_adv_instance(hdev, instance); /* If advertisement doesn't exist, we can't modify its data */ if (!adv) return -ENOENT; if (adv_data_len && ADV_DATA_CMP(adv, adv_data, adv_data_len)) { memset(adv->adv_data, 0, sizeof(adv->adv_data)); memcpy(adv->adv_data, adv_data, adv_data_len); adv->adv_data_len = adv_data_len; adv->adv_data_changed = true; } if (scan_rsp_len && SCAN_RSP_CMP(adv, scan_rsp_data, scan_rsp_len)) { memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); memcpy(adv->scan_rsp_data, scan_rsp_data, scan_rsp_len); adv->scan_rsp_len = scan_rsp_len; adv->scan_rsp_changed = true; } /* Mark as changed if there are flags which would affect it */ if (((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) || adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) adv->scan_rsp_changed = true; return 0; } /* This function requires the caller holds hdev->lock */ u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance) { u32 flags; struct adv_info *adv; if (instance == 0x00) { /* Instance 0 always manages the "Tx Power" and "Flags" * fields */ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting * corresponds to the "connectable" instance flag. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) flags |= MGMT_ADV_FLAG_CONNECTABLE; if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) flags |= MGMT_ADV_FLAG_DISCOV; return flags; } adv = hci_find_adv_instance(hdev, instance); /* Return 0 when we got an invalid instance identifier. */ if (!adv) return 0; return adv->flags; } bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) { struct adv_info *adv; /* Instance 0x00 always set local name */ if (instance == 0x00) return true; adv = hci_find_adv_instance(hdev, instance); if (!adv) return false; if (adv->flags & MGMT_ADV_FLAG_APPEARANCE || adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) return true; return adv->scan_rsp_len ? true : false; } /* This function requires the caller holds hdev->lock */ void hci_adv_monitors_clear(struct hci_dev *hdev) { struct adv_monitor *monitor; int handle; idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) hci_free_adv_monitor(hdev, monitor); idr_destroy(&hdev->adv_monitors_idr); } /* Frees the monitor structure and do some bookkeepings. * This function requires the caller holds hdev->lock. */ void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct adv_pattern *pattern; struct adv_pattern *tmp; if (!monitor) return; list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) { list_del(&pattern->list); kfree(pattern); } if (monitor->handle) idr_remove(&hdev->adv_monitors_idr, monitor->handle); if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED) hdev->adv_monitors_cnt--; kfree(monitor); } /* Assigns handle to a monitor, and if offloading is supported and power is on, * also attempts to forward the request to the controller. * This function requires the caller holds hci_req_sync_lock. */ int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int min, max, handle; int status = 0; if (!monitor) return -EINVAL; hci_dev_lock(hdev); min = HCI_MIN_ADV_MONITOR_HANDLE; max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES; handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max, GFP_KERNEL); hci_dev_unlock(hdev); if (handle < 0) return handle; monitor->handle = handle; if (!hdev_is_powered(hdev)) return status; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: bt_dev_dbg(hdev, "add monitor %d status %d", monitor->handle, status); /* Message was not forwarded to controller - not an error */ break; case HCI_ADV_MONITOR_EXT_MSFT: status = msft_add_monitor_pattern(hdev, monitor); bt_dev_dbg(hdev, "add monitor %d msft status %d", handle, status); break; } return status; } /* Attempts to tell the controller and free the monitor. If somehow the * controller doesn't have a corresponding handle, remove anyway. * This function requires the caller holds hci_req_sync_lock. */ static int hci_remove_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int status = 0; int handle; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */ bt_dev_dbg(hdev, "remove monitor %d status %d", monitor->handle, status); goto free_monitor; case HCI_ADV_MONITOR_EXT_MSFT: handle = monitor->handle; status = msft_remove_monitor(hdev, monitor); bt_dev_dbg(hdev, "remove monitor %d msft status %d", handle, status); break; } /* In case no matching handle registered, just free the monitor */ if (status == -ENOENT) goto free_monitor; return status; free_monitor: if (status == -ENOENT) bt_dev_warn(hdev, "Removing monitor with no matching handle %d", monitor->handle); hci_free_adv_monitor(hdev, monitor); return status; } /* This function requires the caller holds hci_req_sync_lock */ int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle) { struct adv_monitor *monitor = idr_find(&hdev->adv_monitors_idr, handle); if (!monitor) return -EINVAL; return hci_remove_adv_monitor(hdev, monitor); } /* This function requires the caller holds hci_req_sync_lock */ int hci_remove_all_adv_monitor(struct hci_dev *hdev) { struct adv_monitor *monitor; int idr_next_id = 0; int status = 0; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &idr_next_id); if (!monitor) break; status = hci_remove_adv_monitor(hdev, monitor); if (status) return status; idr_next_id++; } return status; } /* This function requires the caller holds hdev->lock */ bool hci_is_adv_monitoring(struct hci_dev *hdev) { return !idr_is_empty(&hdev->adv_monitors_idr); } int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev) { if (msft_monitor_supported(hdev)) return HCI_ADV_MONITOR_EXT_MSFT; return HCI_ADV_MONITOR_EXT_NONE; } struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_irk *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } struct bdaddr_list_with_flags * hci_bdaddr_list_lookup_with_flags(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_flags *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { struct bdaddr_list *b, *n; list_for_each_entry_safe(b, n, bdaddr_list, list) { list_del(&b->list); kfree(b); } } int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; list_add(&entry->list, list); return 0; } int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type, u8 *peer_irk, u8 *local_irk) { struct bdaddr_list_with_irk *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; if (peer_irk) memcpy(entry->peer_irk, peer_irk, 16); if (local_irk) memcpy(entry->local_irk, local_irk, 16); list_add(&entry->list, list); return 0; } int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, u8 type, u32 flags) { struct bdaddr_list_with_flags *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; entry->flags = flags; list_add(&entry->list, list); return 0; } int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_irk *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; list_for_each_entry(params, &hdev->le_conn_params, list) { if (bacmp(&params->addr, addr) == 0 && params->addr_type == addr_type) { return params; } } return NULL; } /* This function requires the caller holds hdev->lock or rcu_read_lock */ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *param; rcu_read_lock(); list_for_each_entry_rcu(param, list, action) { if (bacmp(&param->addr, addr) == 0 && param->addr_type == addr_type) { rcu_read_unlock(); return param; } } rcu_read_unlock(); return NULL; } /* This function requires the caller holds hdev->lock */ void hci_pend_le_list_del_init(struct hci_conn_params *param) { if (list_empty(&param->action)) return; list_del_rcu(&param->action); synchronize_rcu(); INIT_LIST_HEAD(&param->action); } /* This function requires the caller holds hdev->lock */ void hci_pend_le_list_add(struct hci_conn_params *param, struct list_head *list) { list_add_rcu(&param->action, list); } /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, addr, addr_type); if (params) return params; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) { bt_dev_err(hdev, "out of memory"); return NULL; } bacpy(&params->addr, addr); params->addr_type = addr_type; list_add(&params->list, &hdev->le_conn_params); INIT_LIST_HEAD(&params->action); params->conn_min_interval = hdev->le_conn_min_interval; params->conn_max_interval = hdev->le_conn_max_interval; params->conn_latency = hdev->le_conn_latency; params->supervision_timeout = hdev->le_supv_timeout; params->auto_connect = HCI_AUTO_CONN_DISABLED; BT_DBG("addr %pMR (type %u)", addr, addr_type); return params; } void hci_conn_params_free(struct hci_conn_params *params) { hci_pend_le_list_del_init(params); if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); } list_del(&params->list); kfree(params); } /* This function requires the caller holds hdev->lock */ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, addr, addr_type); if (!params) return; hci_conn_params_free(params); hci_update_passive_scan(hdev); BT_DBG("addr %pMR (type %u)", addr, addr_type); } /* This function requires the caller holds hdev->lock */ void hci_conn_params_clear_disabled(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { if (params->auto_connect != HCI_AUTO_CONN_DISABLED) continue; /* If trying to establish one time connection to disabled * device, leave the params, but mark them as just once. */ if (params->explicit_connect) { params->auto_connect = HCI_AUTO_CONN_EXPLICIT; continue; } hci_conn_params_free(params); } BT_DBG("All LE disabled connection parameters were removed"); } /* This function requires the caller holds hdev->lock */ static void hci_conn_params_clear_all(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) hci_conn_params_free(params); BT_DBG("All LE connection parameters were removed"); } /* Copy the Identity Address of the controller. * * If the controller has a public BD_ADDR, then by default use that one. * If this is a LE only controller without a public address, default to * the static random address. * * For debugging purposes it is possible to force controllers with a * public address to use the static random address instead. * * In case BR/EDR has been disabled on a dual-mode controller and * userspace has configured a static address, then that address * becomes the identity address instead of the public BR/EDR address. */ void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *bdaddr_type) { if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || !bacmp(&hdev->bdaddr, BDADDR_ANY) || (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && bacmp(&hdev->static_addr, BDADDR_ANY))) { bacpy(bdaddr, &hdev->static_addr); *bdaddr_type = ADDR_LE_DEV_RANDOM; } else { bacpy(bdaddr, &hdev->bdaddr); *bdaddr_type = ADDR_LE_DEV_PUBLIC; } } static void hci_clear_wake_reason(struct hci_dev *hdev) { hci_dev_lock(hdev); hdev->wake_reason = 0; bacpy(&hdev->wake_addr, BDADDR_ANY); hdev->wake_addr_type = 0; hci_dev_unlock(hdev); } static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct hci_dev *hdev = container_of(nb, struct hci_dev, suspend_notifier); int ret = 0; /* Userspace has full control of this device. Do nothing. */ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return NOTIFY_DONE; /* To avoid a potential race with hci_unregister_dev. */ hci_dev_hold(hdev); switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: ret = hci_suspend_dev(hdev); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: ret = hci_resume_dev(hdev); break; } if (ret) bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", action, ret); hci_dev_put(hdev); return NOTIFY_DONE; } /* Alloc HCI device */ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) { struct hci_dev *hdev; unsigned int alloc_size; alloc_size = sizeof(*hdev); if (sizeof_priv) { /* Fixme: May need ALIGN-ment? */ alloc_size += sizeof_priv; } hdev = kzalloc(alloc_size, GFP_KERNEL); if (!hdev) return NULL; hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->esco_type = (ESCO_HV1); hdev->link_mode = (HCI_LM_ACCEPT); hdev->num_iac = 0x01; /* One IAC support is mandatory */ hdev->io_capability = 0x03; /* No Input No Output */ hdev->manufacturer = 0xffff; /* Default to internal use */ hdev->inq_tx_power = HCI_TX_POWER_INVALID; hdev->adv_tx_power = HCI_TX_POWER_INVALID; hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; hdev->adv_instance_timeout = 0; hdev->advmon_allowlist_duration = 300; hdev->advmon_no_filter_duration = 500; hdev->enable_advmon_interleave_scan = 0x00; /* Default to disable */ hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; hdev->le_adv_channel_map = 0x07; hdev->le_adv_min_interval = 0x0800; hdev->le_adv_max_interval = 0x0800; hdev->le_scan_interval = DISCOV_LE_SCAN_INT_FAST; hdev->le_scan_window = DISCOV_LE_SCAN_WIN_FAST; hdev->le_scan_int_suspend = DISCOV_LE_SCAN_INT_SLOW1; hdev->le_scan_window_suspend = DISCOV_LE_SCAN_WIN_SLOW1; hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT; hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN; hdev->le_scan_int_adv_monitor = DISCOV_LE_SCAN_INT_FAST; hdev->le_scan_window_adv_monitor = DISCOV_LE_SCAN_WIN_FAST; hdev->le_scan_int_connect = DISCOV_LE_SCAN_INT_CONN; hdev->le_scan_window_connect = DISCOV_LE_SCAN_WIN_CONN; hdev->le_conn_min_interval = 0x0018; hdev->le_conn_max_interval = 0x0028; hdev->le_conn_latency = 0x0000; hdev->le_supv_timeout = 0x002a; hdev->le_def_tx_len = 0x001b; hdev->le_def_tx_time = 0x0148; hdev->le_max_tx_len = 0x001b; hdev->le_max_tx_time = 0x0148; hdev->le_max_rx_len = 0x001b; hdev->le_max_rx_time = 0x0148; hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE; hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE; hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION; hdev->def_le_autoconnect_timeout = HCI_LE_CONN_TIMEOUT; hdev->min_le_tx_power = HCI_TX_POWER_INVALID; hdev->max_le_tx_power = HCI_TX_POWER_INVALID; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE; hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE; hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE; /* default 1.28 sec page scan */ hdev->def_page_scan_type = PAGE_SCAN_TYPE_STANDARD; hdev->def_page_scan_int = 0x0800; hdev->def_page_scan_window = 0x0012; mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); mutex_init(&hdev->mgmt_pending_lock); ida_init(&hdev->unset_handle_ida); INIT_LIST_HEAD(&hdev->mesh_pending); INIT_LIST_HEAD(&hdev->mgmt_pending); INIT_LIST_HEAD(&hdev->reject_list); INIT_LIST_HEAD(&hdev->accept_list); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); INIT_LIST_HEAD(&hdev->identity_resolving_keys); INIT_LIST_HEAD(&hdev->remote_oob_data); INIT_LIST_HEAD(&hdev->le_accept_list); INIT_LIST_HEAD(&hdev->le_resolv_list); INIT_LIST_HEAD(&hdev->le_conn_params); INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); INIT_LIST_HEAD(&hdev->adv_instances); INIT_LIST_HEAD(&hdev->blocked_keys); INIT_LIST_HEAD(&hdev->monitored_devices); INIT_LIST_HEAD(&hdev->local_codecs); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); INIT_WORK(&hdev->tx_work, hci_tx_work); INIT_WORK(&hdev->power_on, hci_power_on); INIT_WORK(&hdev->error_reset, hci_error_reset); hci_cmd_sync_init(hdev); INIT_DELAYED_WORK(&hdev->power_off, hci_power_off); skb_queue_head_init(&hdev->rx_q); skb_queue_head_init(&hdev->cmd_q); skb_queue_head_init(&hdev->raw_q); init_waitqueue_head(&hdev->req_wait_q); INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout); INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout); hci_devcd_setup(hdev); hci_init_sysfs(hdev); discovery_init(hdev); return hdev; } EXPORT_SYMBOL(hci_alloc_dev_priv); /* Free HCI device */ void hci_free_dev(struct hci_dev *hdev) { /* will free via device release */ put_device(&hdev->dev); } EXPORT_SYMBOL(hci_free_dev); /* Register HCI device */ int hci_register_dev(struct hci_dev *hdev) { int id, error; if (!hdev->open || !hdev->close || !hdev->send) return -EINVAL; id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL); if (id < 0) return id; error = dev_set_name(&hdev->dev, "hci%u", id); if (error) return error; hdev->name = dev_name(&hdev->dev); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); hdev->workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name); if (!hdev->workqueue) { error = -ENOMEM; goto err; } hdev->req_workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name); if (!hdev->req_workqueue) { destroy_workqueue(hdev->workqueue); error = -ENOMEM; goto err; } if (!IS_ERR_OR_NULL(bt_debugfs)) hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs); error = device_add(&hdev->dev); if (error < 0) goto err_wqueue; hci_leds_init(hdev); hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, hdev); if (hdev->rfkill) { if (rfkill_register(hdev->rfkill) < 0) { rfkill_destroy(hdev->rfkill); hdev->rfkill = NULL; } } if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) hci_dev_set_flag(hdev, HCI_RFKILLED); hci_dev_set_flag(hdev, HCI_SETUP); hci_dev_set_flag(hdev, HCI_AUTO_OFF); /* Assume BR/EDR support until proven otherwise (such as * through reading supported features during init. */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); write_lock(&hci_dev_list_lock); list_add(&hdev->list, &hci_dev_list); write_unlock(&hci_dev_list_lock); /* Devices that are marked for raw-only usage are unconfigured * and should not be included in normal operation. */ if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* Mark Remote Wakeup connection flag as supported if driver has wakeup * callback. */ if (hdev->wakeup) hdev->conn_flags |= HCI_CONN_FLAG_REMOTE_WAKEUP; hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); error = hci_register_suspend_notifier(hdev); if (error) BT_WARN("register suspend notifier failed error:%d\n", error); queue_work(hdev->req_workqueue, &hdev->power_on); idr_init(&hdev->adv_monitors_idr); msft_register(hdev); return id; err_wqueue: debugfs_remove_recursive(hdev->debugfs); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); err: ida_free(&hci_index_ida, hdev->id); return error; } EXPORT_SYMBOL(hci_register_dev); /* Unregister HCI device */ void hci_unregister_dev(struct hci_dev *hdev) { BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); mutex_lock(&hdev->unregister_lock); hci_dev_set_flag(hdev, HCI_UNREGISTER); mutex_unlock(&hdev->unregister_lock); write_lock(&hci_dev_list_lock); list_del(&hdev->list); write_unlock(&hci_dev_list_lock); disable_work_sync(&hdev->rx_work); disable_work_sync(&hdev->cmd_work); disable_work_sync(&hdev->tx_work); disable_work_sync(&hdev->power_on); disable_work_sync(&hdev->error_reset); hci_cmd_sync_clear(hdev); hci_unregister_suspend_notifier(hdev); hci_dev_do_close(hdev); if (!test_bit(HCI_INIT, &hdev->flags) && !hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { hci_dev_lock(hdev); mgmt_index_removed(hdev); hci_dev_unlock(hdev); } /* mgmt_index_removed should take care of emptying the * pending list */ BUG_ON(!list_empty(&hdev->mgmt_pending)); hci_sock_dev_event(hdev, HCI_DEV_UNREG); if (hdev->rfkill) { rfkill_unregister(hdev->rfkill); rfkill_destroy(hdev->rfkill); } device_del(&hdev->dev); /* Actual cleanup is deferred until hci_release_dev(). */ hci_dev_put(hdev); } EXPORT_SYMBOL(hci_unregister_dev); /* Release HCI device */ void hci_release_dev(struct hci_dev *hdev) { debugfs_remove_recursive(hdev->debugfs); kfree_const(hdev->hw_info); kfree_const(hdev->fw_info); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); hci_dev_lock(hdev); hci_bdaddr_list_clear(&hdev->reject_list); hci_bdaddr_list_clear(&hdev->accept_list); hci_uuids_clear(hdev); hci_link_keys_clear(hdev); hci_smp_ltks_clear(hdev); hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); hci_adv_monitors_clear(hdev); hci_bdaddr_list_clear(&hdev->le_accept_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); hci_blocked_keys_clear(hdev); hci_codec_list_clear(&hdev->local_codecs); msft_release(hdev); hci_dev_unlock(hdev); ida_destroy(&hdev->unset_handle_ida); ida_free(&hci_index_ida, hdev->id); kfree_skb(hdev->sent_cmd); kfree_skb(hdev->req_skb); kfree_skb(hdev->recv_event); kfree(hdev); } EXPORT_SYMBOL(hci_release_dev); int hci_register_suspend_notifier(struct hci_dev *hdev) { int ret = 0; if (!hdev->suspend_notifier.notifier_call && !test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { hdev->suspend_notifier.notifier_call = hci_suspend_notifier; ret = register_pm_notifier(&hdev->suspend_notifier); } return ret; } int hci_unregister_suspend_notifier(struct hci_dev *hdev) { int ret = 0; if (hdev->suspend_notifier.notifier_call) { ret = unregister_pm_notifier(&hdev->suspend_notifier); if (!ret) hdev->suspend_notifier.notifier_call = NULL; } return ret; } /* Cancel ongoing command synchronously: * * - Cancel command timer * - Reset command counter * - Cancel command request */ static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { disable_delayed_work_sync(&hdev->cmd_timer); disable_delayed_work_sync(&hdev->ncmd_timer); } else { cancel_delayed_work_sync(&hdev->cmd_timer); cancel_delayed_work_sync(&hdev->ncmd_timer); } atomic_set(&hdev->cmd_cnt, 1); hci_cmd_sync_cancel_sync(hdev, err); } /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); /* Suspend should only act on when powered. */ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* If powering down don't attempt to suspend */ if (mgmt_powering_down(hdev)) return 0; /* Cancel potentially blocking sync operation before suspend */ hci_cancel_cmd_sync(hdev, EHOSTDOWN); hci_req_sync_lock(hdev); ret = hci_suspend_sync(hdev); hci_req_sync_unlock(hdev); hci_clear_wake_reason(hdev); mgmt_suspending(hdev, hdev->suspend_state); hci_sock_dev_event(hdev, HCI_DEV_SUSPEND); return ret; } EXPORT_SYMBOL(hci_suspend_dev); /* Resume HCI device */ int hci_resume_dev(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); /* Resume should only act on when powered. */ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* If powering down don't attempt to resume */ if (mgmt_powering_down(hdev)) return 0; hci_req_sync_lock(hdev); ret = hci_resume_sync(hdev); hci_req_sync_unlock(hdev); mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr, hdev->wake_addr_type); hci_sock_dev_event(hdev, HCI_DEV_RESUME); return ret; } EXPORT_SYMBOL(hci_resume_dev); /* Reset HCI device */ int hci_reset_dev(struct hci_dev *hdev) { static const u8 hw_err[] = { HCI_EV_HARDWARE_ERROR, 0x01, 0x00 }; struct sk_buff *skb; skb = bt_skb_alloc(3, GFP_ATOMIC); if (!skb) return -ENOMEM; hci_skb_pkt_type(skb) = HCI_EVENT_PKT; skb_put_data(skb, hw_err, 3); bt_dev_err(hdev, "Injecting HCI hardware error event"); /* Send Hardware Error to upper stack */ return hci_recv_frame(hdev, skb); } EXPORT_SYMBOL(hci_reset_dev); static u8 hci_dev_classify_pkt_type(struct hci_dev *hdev, struct sk_buff *skb) { if (hdev->classify_pkt_type) return hdev->classify_pkt_type(hdev, skb); return hci_skb_pkt_type(skb); } /* Receive frame from HCI drivers */ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) { u8 dev_pkt_type; if (!hdev || (!test_bit(HCI_UP, &hdev->flags) && !test_bit(HCI_INIT, &hdev->flags))) { kfree_skb(skb); return -ENXIO; } /* Check if the driver agree with packet type classification */ dev_pkt_type = hci_dev_classify_pkt_type(hdev, skb); if (hci_skb_pkt_type(skb) != dev_pkt_type) { hci_skb_pkt_type(skb) = dev_pkt_type; } switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: break; case HCI_ACLDATA_PKT: /* Detect if ISO packet has been sent as ACL */ if (hci_conn_num(hdev, CIS_LINK) || hci_conn_num(hdev, BIS_LINK)) { __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle); __u8 type; type = hci_conn_lookup_type(hdev, hci_handle(handle)); if (type == CIS_LINK || type == BIS_LINK) hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; } break; case HCI_SCODATA_PKT: break; case HCI_ISODATA_PKT: break; case HCI_DRV_PKT: break; default: kfree_skb(skb); return -EINVAL; } /* Incoming skb */ bt_cb(skb)->incoming = 1; /* Time stamp */ __net_timestamp(skb); skb_queue_tail(&hdev->rx_q, skb); queue_work(hdev->workqueue, &hdev->rx_work); return 0; } EXPORT_SYMBOL(hci_recv_frame); /* Receive diagnostic message from HCI drivers */ int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) { /* Mark as diagnostic packet */ hci_skb_pkt_type(skb) = HCI_DIAG_PKT; /* Time stamp */ __net_timestamp(skb); skb_queue_tail(&hdev->rx_q, skb); queue_work(hdev->workqueue, &hdev->rx_work); return 0; } EXPORT_SYMBOL(hci_recv_diag); void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); kfree_const(hdev->hw_info); hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); va_end(vargs); } EXPORT_SYMBOL(hci_set_hw_info); void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); kfree_const(hdev->fw_info); hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); va_end(vargs); } EXPORT_SYMBOL(hci_set_fw_info); /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); mutex_lock(&hci_cb_list_lock); list_add_tail(&cb->list, &hci_cb_list); mutex_unlock(&hci_cb_list_lock); return 0; } EXPORT_SYMBOL(hci_register_cb); int hci_unregister_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); mutex_lock(&hci_cb_list_lock); list_del(&cb->list); mutex_unlock(&hci_cb_list_lock); return 0; } EXPORT_SYMBOL(hci_unregister_cb); static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { int err; BT_DBG("%s type %d len %d", hdev->name, hci_skb_pkt_type(skb), skb->len); /* Time stamp */ __net_timestamp(skb); /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); if (atomic_read(&hdev->promisc)) { /* Send copy to the sockets */ hci_send_to_sock(hdev, skb); } /* Get rid of skb owner, prior to sending to the driver. */ skb_orphan(skb); if (!test_bit(HCI_RUNNING, &hdev->flags)) { kfree_skb(skb); return -EINVAL; } if (hci_skb_pkt_type(skb) == HCI_DRV_PKT) { /* Intercept HCI Drv packet here and don't go with hdev->send * callback. */ err = hci_drv_process_cmd(hdev, skb); kfree_skb(skb); return err; } err = hdev->send(hdev, skb); if (err < 0) { bt_dev_err(hdev, "sending frame failed (%d)", err); kfree_skb(skb); return err; } return 0; } static int hci_send_conn_frame(struct hci_dev *hdev, struct hci_conn *conn, struct sk_buff *skb) { hci_conn_tx_queue(conn, skb); return hci_send_frame(hdev, skb); } /* Send HCI command */ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param) { struct sk_buff *skb; BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen); skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL); if (!skb) { bt_dev_err(hdev, "no memory for command"); return -ENOMEM; } /* Stand-alone HCI commands must be flagged as * single-command requests. */ bt_cb(skb)->hci.req_flags |= HCI_REQ_START; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); return 0; } int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param) { struct sk_buff *skb; if (hci_opcode_ogf(opcode) != 0x3f) { /* A controller receiving a command shall respond with either * a Command Status Event or a Command Complete Event. * Therefore, all standard HCI commands must be sent via the * standard API, using hci_send_cmd or hci_cmd_sync helpers. * Some vendors do not comply with this rule for vendor-specific * commands and do not return any event. We want to support * unresponded commands for such cases only. */ bt_dev_err(hdev, "unresponded command not supported"); return -EINVAL; } skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL); if (!skb) { bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", opcode); return -ENOMEM; } hci_send_frame(hdev, skb); return 0; } EXPORT_SYMBOL(__hci_cmd_send); /* Get data from the previously sent command */ static void *hci_cmd_data(struct sk_buff *skb, __u16 opcode) { struct hci_command_hdr *hdr; if (!skb || skb->len < HCI_COMMAND_HDR_SIZE) return NULL; hdr = (void *)skb->data; if (hdr->opcode != cpu_to_le16(opcode)) return NULL; return skb->data + HCI_COMMAND_HDR_SIZE; } /* Get data from the previously sent command */ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) { void *data; /* Check if opcode matches last sent command */ data = hci_cmd_data(hdev->sent_cmd, opcode); if (!data) /* Check if opcode matches last request */ data = hci_cmd_data(hdev->req_skb, opcode); return data; } /* Get data from last received event */ void *hci_recv_event_data(struct hci_dev *hdev, __u8 event) { struct hci_event_hdr *hdr; int offset; if (!hdev->recv_event) return NULL; hdr = (void *)hdev->recv_event->data; offset = sizeof(*hdr); if (hdr->evt != event) { /* In case of LE metaevent check the subevent match */ if (hdr->evt == HCI_EV_LE_META) { struct hci_ev_le_meta *ev; ev = (void *)hdev->recv_event->data + offset; offset += sizeof(*ev); if (ev->subevent == event) goto found; } return NULL; } found: bt_dev_dbg(hdev, "event 0x%2.2x", event); return hdev->recv_event->data + offset; } /* Send ACL data */ static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) { struct hci_acl_hdr *hdr; int len = skb->len; skb_push(skb, HCI_ACL_HDR_SIZE); skb_reset_transport_header(skb); hdr = (struct hci_acl_hdr *)skb_transport_header(skb); hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); hdr->dlen = cpu_to_le16(len); } static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, struct sk_buff *skb, __u16 flags) { struct hci_conn *conn = chan->conn; struct hci_dev *hdev = conn->hdev; struct sk_buff *list; skb->len = skb_headlen(skb); skb->data_len = 0; hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags); list = skb_shinfo(skb)->frag_list; if (!list) { /* Non fragmented */ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); skb_queue_tail(queue, skb); } else { /* Fragmented */ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); skb_shinfo(skb)->frag_list = NULL; /* Queue all fragments atomically. We need to use spin_lock_bh * here because of 6LoWPAN links, as there this function is * called from softirq and using normal spin lock could cause * deadlocks. */ spin_lock_bh(&queue->lock); __skb_queue_tail(queue, skb); flags &= ~ACL_START; flags |= ACL_CONT; do { skb = list; list = list->next; hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); __skb_queue_tail(queue, skb); } while (list); spin_unlock_bh(&queue->lock); } } void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags) { struct hci_dev *hdev = chan->conn->hdev; BT_DBG("%s chan %p flags 0x%4.4x", hdev->name, chan, flags); hci_queue_acl(chan, &chan->data_q, skb, flags); queue_work(hdev->workqueue, &hdev->tx_work); } /* Send SCO data */ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; struct hci_sco_hdr hdr; BT_DBG("%s len %d", hdev->name, skb->len); hdr.handle = cpu_to_le16(conn->handle); hdr.dlen = skb->len; skb_push(skb, HCI_SCO_HDR_SIZE); skb_reset_transport_header(skb); memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE); hci_skb_pkt_type(skb) = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } /* Send ISO data */ static void hci_add_iso_hdr(struct sk_buff *skb, __u16 handle, __u8 flags) { struct hci_iso_hdr *hdr; int len = skb->len; skb_push(skb, HCI_ISO_HDR_SIZE); skb_reset_transport_header(skb); hdr = (struct hci_iso_hdr *)skb_transport_header(skb); hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); hdr->dlen = cpu_to_le16(len); } static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; struct sk_buff *list; __u16 flags; skb->len = skb_headlen(skb); skb->data_len = 0; hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; list = skb_shinfo(skb)->frag_list; flags = hci_iso_flags_pack(list ? ISO_START : ISO_SINGLE, 0x00); hci_add_iso_hdr(skb, conn->handle, flags); if (!list) { /* Non fragmented */ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); skb_queue_tail(queue, skb); } else { /* Fragmented */ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); skb_shinfo(skb)->frag_list = NULL; __skb_queue_tail(queue, skb); do { skb = list; list = list->next; hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; flags = hci_iso_flags_pack(list ? ISO_CONT : ISO_END, 0x00); hci_add_iso_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); __skb_queue_tail(queue, skb); } while (list); } } void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; BT_DBG("%s len %d", hdev->name, skb->len); hci_queue_iso(conn, &conn->data_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } /* ---- HCI TX task (outgoing data) ---- */ /* HCI Connection scheduler */ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) { struct hci_dev *hdev; int cnt, q; if (!conn) { *quote = 0; return; } hdev = conn->hdev; switch (conn->type) { case ACL_LINK: cnt = hdev->acl_cnt; break; case SCO_LINK: case ESCO_LINK: cnt = hdev->sco_cnt; break; case LE_LINK: cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; case CIS_LINK: case BIS_LINK: cnt = hdev->iso_mtu ? hdev->iso_cnt : hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; default: cnt = 0; bt_dev_err(hdev, "unknown link type %d", conn->type); } q = cnt / num; *quote = q ? q : 1; } static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, __u8 type2, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn = NULL, *c; unsigned int num = 0, min = ~0; /* We don't have to lock device here. Connections are always * added and removed with TX task disabled. */ rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if ((c->type != type && c->type != type2) || skb_queue_empty(&c->data_q)) continue; if (c->state != BT_CONNECTED && c->state != BT_CONFIG) continue; num++; if (c->sent < min) { min = c->sent; conn = c; } if (hci_conn_num(hdev, type) == num) break; } rcu_read_unlock(); hci_quote_sent(conn, num, quote); BT_DBG("conn %p quote %d", conn, *quote); return conn; } static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; bt_dev_err(hdev, "link tx timeout"); hci_dev_lock(hdev); /* Kill stalled connections */ list_for_each_entry(c, &h->list, list) { if (c->type == type && c->sent) { bt_dev_err(hdev, "killing stalled connection %pMR", &c->dst); hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); } } hci_dev_unlock(hdev); } static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_chan *chan = NULL; unsigned int num = 0, min = ~0, cur_prio = 0; struct hci_conn *conn; int conn_num = 0; BT_DBG("%s", hdev->name); rcu_read_lock(); list_for_each_entry_rcu(conn, &h->list, list) { struct hci_chan *tmp; if (conn->type != type) continue; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) continue; conn_num++; list_for_each_entry_rcu(tmp, &conn->chan_list, list) { struct sk_buff *skb; if (skb_queue_empty(&tmp->data_q)) continue; skb = skb_peek(&tmp->data_q); if (skb->priority < cur_prio) continue; if (skb->priority > cur_prio) { num = 0; min = ~0; cur_prio = skb->priority; } num++; if (conn->sent < min) { min = conn->sent; chan = tmp; } } if (hci_conn_num(hdev, type) == conn_num) break; } rcu_read_unlock(); if (!chan) return NULL; hci_quote_sent(chan->conn, num, quote); BT_DBG("chan %p quote %d", chan, *quote); return chan; } static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn; int num = 0; BT_DBG("%s", hdev->name); rcu_read_lock(); list_for_each_entry_rcu(conn, &h->list, list) { struct hci_chan *chan; if (conn->type != type) continue; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) continue; num++; list_for_each_entry_rcu(chan, &conn->chan_list, list) { struct sk_buff *skb; if (chan->sent) { chan->sent = 0; continue; } if (skb_queue_empty(&chan->data_q)) continue; skb = skb_peek(&chan->data_q); if (skb->priority >= HCI_PRIO_MAX - 1) continue; skb->priority = HCI_PRIO_MAX - 1; BT_DBG("chan %p skb %p promoted to %d", chan, skb, skb->priority); } if (hci_conn_num(hdev, type) == num) break; } rcu_read_unlock(); } static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { unsigned long last_tx; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return; switch (type) { case LE_LINK: last_tx = hdev->le_last_tx; break; default: last_tx = hdev->acl_last_tx; break; } /* tx timeout must be longer than maximum link supervision timeout * (40.9 seconds) */ if (!cnt && time_after(jiffies, last_tx + HCI_ACL_TX_TIMEOUT)) hci_link_tx_to(hdev, type); } /* Schedule SCO */ static void hci_sched_sco(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; int quote, *cnt; unsigned int pkts = hdev->sco_pkts; bt_dev_dbg(hdev, "type %u", type); if (!hci_conn_num(hdev, type) || !pkts) return; /* Use sco_pkts if flow control has not been enabled which will limit * the amount of buffer sent in a row. */ if (!hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) cnt = &pkts; else cnt = &hdev->sco_cnt; while (*cnt && (conn = hci_low_sent(hdev, type, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); conn->sent++; if (conn->sent == ~0) conn->sent = 0; (*cnt)--; } } /* Rescheduled if all packets were sent and flow control is not enabled * as there could be more packets queued that could not be sent and * since no HCI_EV_NUM_COMP_PKTS event will be generated the reschedule * needs to be forced. */ if (!pkts && !hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) queue_work(hdev->workqueue, &hdev->tx_work); } static void hci_sched_acl_pkt(struct hci_dev *hdev) { unsigned int cnt = hdev->acl_cnt; struct hci_chan *chan; struct sk_buff *skb; int quote; __check_timeout(hdev, cnt, ACL_LINK); while (hdev->acl_cnt && (chan = hci_chan_sent(hdev, ACL_LINK, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); hci_conn_enter_active_mode(chan->conn, bt_cb(skb)->force_active); hci_send_conn_frame(hdev, chan->conn, skb); hdev->acl_last_tx = jiffies; hdev->acl_cnt--; chan->sent++; chan->conn->sent++; /* Send pending SCO packets right away */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); } } if (cnt != hdev->acl_cnt) hci_prio_recalculate(hdev, ACL_LINK); } static void hci_sched_acl(struct hci_dev *hdev) { BT_DBG("%s", hdev->name); /* No ACL link over BR/EDR controller */ if (!hci_conn_num(hdev, ACL_LINK)) return; hci_sched_acl_pkt(hdev); } static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; struct sk_buff *skb; int quote, *cnt, tmp; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, LE_LINK)) return; cnt = hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt; __check_timeout(hdev, *cnt, LE_LINK); tmp = *cnt; while (*cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); hci_send_conn_frame(hdev, chan->conn, skb); hdev->le_last_tx = jiffies; (*cnt)--; chan->sent++; chan->conn->sent++; /* Send pending SCO packets right away */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); } } if (*cnt != tmp) hci_prio_recalculate(hdev, LE_LINK); } /* Schedule CIS */ static void hci_sched_iso(struct hci_dev *hdev) { struct hci_conn *conn; struct sk_buff *skb; int quote, *cnt; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, CIS_LINK) && !hci_conn_num(hdev, BIS_LINK)) return; cnt = hdev->iso_pkts ? &hdev->iso_cnt : hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt; while (*cnt && (conn = hci_low_sent(hdev, CIS_LINK, BIS_LINK, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); conn->sent++; if (conn->sent == ~0) conn->sent = 0; (*cnt)--; } } } static void hci_tx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, tx_work); struct sk_buff *skb; BT_DBG("%s acl %d sco %d le %d iso %d", hdev->name, hdev->acl_cnt, hdev->sco_cnt, hdev->le_cnt, hdev->iso_cnt); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); hci_sched_iso(hdev); hci_sched_acl(hdev); hci_sched_le(hdev); } /* Send next queued raw (unknown type) packet */ while ((skb = skb_dequeue(&hdev->raw_q))) hci_send_frame(hdev, skb); } /* ----- HCI RX task (incoming data processing) ----- */ /* ACL data packet */ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_acl_hdr *hdr; struct hci_conn *conn; __u16 handle, flags; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ACL packet too small"); goto drop; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hdev->stat.acl_rx++; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); hci_dev_unlock(hdev); if (conn) { hci_conn_enter_active_mode(conn, BT_POWER_FORCE_ACTIVE_OFF); /* Send to upper protocol */ l2cap_recv_acldata(conn, skb, flags); return; } else { bt_dev_err(hdev, "ACL packet for unknown connection handle %d", handle); } drop: kfree_skb(skb); } /* SCO data packet */ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_sco_hdr *hdr; struct hci_conn *conn; __u16 handle, flags; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "SCO packet too small"); goto drop; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hdev->stat.sco_rx++; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); hci_dev_unlock(hdev); if (conn) { /* Send to upper protocol */ hci_skb_pkt_status(skb) = flags & 0x03; sco_recv_scodata(conn, skb); return; } else { bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d", handle); } drop: kfree_skb(skb); } static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_iso_hdr *hdr; struct hci_conn *conn; __u16 handle, flags; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ISO packet too small"); goto drop; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); hci_dev_unlock(hdev); if (!conn) { bt_dev_err(hdev, "ISO packet for unknown connection handle %d", handle); goto drop; } /* Send to upper protocol */ iso_recv(conn, skb, flags); return; drop: kfree_skb(skb); } static bool hci_req_is_complete(struct hci_dev *hdev) { struct sk_buff *skb; skb = skb_peek(&hdev->cmd_q); if (!skb) return true; return (bt_cb(skb)->hci.req_flags & HCI_REQ_START); } static void hci_resend_last(struct hci_dev *hdev) { struct hci_command_hdr *sent; struct sk_buff *skb; u16 opcode; if (!hdev->sent_cmd) return; sent = (void *) hdev->sent_cmd->data; opcode = __le16_to_cpu(sent->opcode); if (opcode == HCI_OP_RESET) return; skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); if (!skb) return; skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); } void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, hci_req_complete_t *req_complete, hci_req_complete_skb_t *req_complete_skb) { struct sk_buff *skb; unsigned long flags; BT_DBG("opcode 0x%04x status 0x%02x", opcode, status); /* If the completed command doesn't match the last one that was * sent we need to do special handling of it. */ if (!hci_sent_cmd_data(hdev, opcode)) { /* Some CSR based controllers generate a spontaneous * reset complete event during init and any pending * command will never be completed. In such a case we * need to resend whatever was the last sent * command. */ if (test_bit(HCI_INIT, &hdev->flags) && opcode == HCI_OP_RESET) hci_resend_last(hdev); return; } /* If we reach this point this event matches the last command sent */ hci_dev_clear_flag(hdev, HCI_CMD_PENDING); /* If the command succeeded and there's still more commands in * this request the request is not yet complete. */ if (!status && !hci_req_is_complete(hdev)) return; skb = hdev->req_skb; /* If this was the last command in a request the complete * callback would be found in hdev->req_skb instead of the * command queue (hdev->cmd_q). */ if (skb && bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) { *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; return; } if (skb && bt_cb(skb)->hci.req_complete) { *req_complete = bt_cb(skb)->hci.req_complete; return; } /* Remove all pending commands belonging to this request */ spin_lock_irqsave(&hdev->cmd_q.lock, flags); while ((skb = __skb_dequeue(&hdev->cmd_q))) { if (bt_cb(skb)->hci.req_flags & HCI_REQ_START) { __skb_queue_head(&hdev->cmd_q, skb); break; } if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; else *req_complete = bt_cb(skb)->hci.req_complete; dev_kfree_skb_irq(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); } static void hci_rx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, rx_work); struct sk_buff *skb; BT_DBG("%s", hdev->name); /* The kcov_remote functions used for collecting packet parsing * coverage information from this background thread and associate * the coverage with the syscall's thread which originally injected * the packet. This helps fuzzing the kernel. */ for (; (skb = skb_dequeue(&hdev->rx_q)); kcov_remote_stop()) { kcov_remote_start_common(skb_get_kcov_handle(skb)); /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); if (atomic_read(&hdev->promisc)) { /* Send copy to the sockets */ hci_send_to_sock(hdev, skb); } /* If the device has been opened in HCI_USER_CHANNEL, * the userspace has exclusive access to device. * When device is HCI_INIT, we still need to process * the data packets to the driver in order * to complete its setup(). */ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !test_bit(HCI_INIT, &hdev->flags)) { kfree_skb(skb); continue; } if (test_bit(HCI_INIT, &hdev->flags)) { /* Don't process data packets in this states. */ switch (hci_skb_pkt_type(skb)) { case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: case HCI_ISODATA_PKT: kfree_skb(skb); continue; } } /* Process frame */ switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: BT_DBG("%s Event packet", hdev->name); hci_event_packet(hdev, skb); break; case HCI_ACLDATA_PKT: BT_DBG("%s ACL data packet", hdev->name); hci_acldata_packet(hdev, skb); break; case HCI_SCODATA_PKT: BT_DBG("%s SCO data packet", hdev->name); hci_scodata_packet(hdev, skb); break; case HCI_ISODATA_PKT: BT_DBG("%s ISO data packet", hdev->name); hci_isodata_packet(hdev, skb); break; default: kfree_skb(skb); break; } } } static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) { int err; bt_dev_dbg(hdev, "skb %p", skb); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = skb_clone(skb, GFP_KERNEL); if (!hdev->sent_cmd) { skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); return; } if (hci_skb_opcode(skb) != HCI_OP_NOP) { err = hci_send_frame(hdev, skb); if (err < 0) { hci_cmd_sync_cancel_sync(hdev, -err); return; } atomic_dec(&hdev->cmd_cnt); } if (hdev->req_status == HCI_REQ_PEND && !hci_dev_test_and_set_flag(hdev, HCI_CMD_PENDING)) { kfree_skb(hdev->req_skb); hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); } } static void hci_cmd_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work); struct sk_buff *skb; BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name, atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q)); /* Send queued commands */ if (atomic_read(&hdev->cmd_cnt)) { skb = skb_dequeue(&hdev->cmd_q); if (!skb) return; hci_send_cmd_sync(hdev, skb); rcu_read_lock(); if (test_bit(HCI_RESET, &hdev->flags) || hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) cancel_delayed_work(&hdev->cmd_timer); else queue_delayed_work(hdev->workqueue, &hdev->cmd_timer, HCI_CMD_TIMEOUT); rcu_read_unlock(); } }
210 209 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 // SPDX-License-Identifier: GPL-2.0-only /* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines * * Copyright (c) 2019-2020 Red Hat GmbH * * Author: Stefano Brivio <sbrivio@redhat.com> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <uapi/linux/netfilter/nf_tables.h> #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <asm/fpu/api.h> #include "nft_set_pipapo_avx2.h" #include "nft_set_pipapo.h" #define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG) /* Load from memory into YMM register with non-temporal hint ("stream load"), * that is, don't fetch lines from memory into the cache. This avoids pushing * precious packet data out of the cache hierarchy, and is appropriate when: * * - loading buckets from lookup tables, as they are not going to be used * again before packets are entirely classified * * - loading the result bitmap from the previous field, as it's never used * again */ #define NFT_PIPAPO_AVX2_LOAD(reg, loc) \ asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc)) /* Stream a single lookup table bucket into YMM register given lookup table, * group index, value of packet bits, bucket size. */ #define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \ NFT_PIPAPO_AVX2_LOAD(reg, \ lt[((group) * NFT_PIPAPO_BUCKETS(4) + \ (v)) * (bsize)]) #define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \ NFT_PIPAPO_AVX2_LOAD(reg, \ lt[((group) * NFT_PIPAPO_BUCKETS(8) + \ (v)) * (bsize)]) /* Bitwise AND: the staple operation of this algorithm */ #define NFT_PIPAPO_AVX2_AND(dst, a, b) \ asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst) /* Jump to label if @reg is zero */ #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ "je %l[" #label "]" : : : : label) /* Store 256 bits from YMM register into memory. Contrary to bucket load * operation, we don't bypass the cache here, as stored matching results * are always used shortly after. */ #define NFT_PIPAPO_AVX2_STORE(loc, reg) \ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc)) /* Zero out a complete YMM register, @reg */ #define NFT_PIPAPO_AVX2_ZERO(reg) \ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg) /** * nft_pipapo_avx2_prepare() - Prepare before main algorithm body * * This zeroes out ymm15, which is later used whenever we need to clear a * memory location, by storing its content into memory. */ static void nft_pipapo_avx2_prepare(void) { NFT_PIPAPO_AVX2_ZERO(15); } /** * nft_pipapo_avx2_fill() - Fill a bitmap region with ones * @data: Base memory area * @start: First bit to set * @len: Count of bits to fill * * This is nothing else than a version of bitmap_set(), as used e.g. by * pipapo_refill(), tailored for the microarchitectures using it and better * suited for the specific usage: it's very likely that we'll set a small number * of bits, not crossing a word boundary, and correct branch prediction is * critical here. * * This function doesn't actually use any AVX2 instruction. */ static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len) { int offset = start % BITS_PER_LONG; unsigned long mask; data += start / BITS_PER_LONG; if (likely(len == 1)) { *data |= BIT(offset); return; } if (likely(len < BITS_PER_LONG || offset)) { if (likely(len + offset <= BITS_PER_LONG)) { *data |= GENMASK(len - 1 + offset, offset); return; } *data |= ~0UL << offset; len -= BITS_PER_LONG - offset; data++; if (len <= BITS_PER_LONG) { mask = ~0UL >> (BITS_PER_LONG - len); *data |= mask; return; } } memset(data, 0xff, len / BITS_PER_BYTE); data += len / BITS_PER_LONG; len %= BITS_PER_LONG; if (len) *data |= ~0UL >> (BITS_PER_LONG - len); } /** * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits * @offset: Start from given bitmap (equivalent to bucket) offset, in longs * @map: Bitmap to be scanned for set bits * @dst: Destination bitmap * @mt: Mapping table containing bit set specifiers * @last: Return index of first set bit, if this is the last field * * This is an alternative implementation of pipapo_refill() suitable for usage * with AVX2 lookup routines: we know there are four words to be scanned, at * a given offset inside the map, for each matching iteration. * * This function doesn't actually use any AVX2 instruction. * * Return: first set bit index if @last, index of first filled word otherwise. */ static int nft_pipapo_avx2_refill(int offset, unsigned long *map, unsigned long *dst, union nft_pipapo_map_bucket *mt, bool last) { int ret = -1; #define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \ do { \ while (map[(x)]) { \ int r = __builtin_ctzl(map[(x)]); \ int i = (offset + (x)) * BITS_PER_LONG + r; \ \ if (last) \ return i; \ \ nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \ \ if (ret == -1) \ ret = mt[i].to; \ \ map[(x)] &= ~(1UL << r); \ } \ } while (0) NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0); NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1); NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2); NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3); #undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD return ret; } /** * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * Load buckets from lookup table corresponding to the values of each 4-bit * group of packet bytes, and perform a bitwise intersection between them. If * this is the first field in the set, simply AND the buckets together * (equivalent to using an all-ones starting bitmap), use the provided starting * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next * working bitmap, @fill. * * This is used for 8-bit fields (i.e. protocol numbers). * * Out-of-order (and superscalar) execution is vital here, so it's critical to * avoid false data dependencies. CPU and compiler could (mostly) take care of * this on their own, but the operation ordering is explicitly given here with * a likely execution order in mind, to highlight possible stalls. That's why * a number of logically distinct operations (i.e. loading buckets, intersecting * buckets) are interleaved. * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf }; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_AND(4, 0, 1); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing); NFT_PIPAPO_AVX2_AND(3, 0, 1); NFT_PIPAPO_AVX2_AND(4, 2, 3); } NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 16-bit fields (i.e. ports). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf }; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize); NFT_PIPAPO_AVX2_AND(4, 0, 1); NFT_PIPAPO_AVX2_AND(5, 2, 3); NFT_PIPAPO_AVX2_AND(7, 4, 5); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); NFT_PIPAPO_AVX2_AND(6, 2, 3); NFT_PIPAPO_AVX2_AND(7, 4, 5); /* Stall */ NFT_PIPAPO_AVX2_AND(7, 6, 7); } /* Stall */ NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 7); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 32-bit fields (i.e. IPv4 addresses). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, }; int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize); NFT_PIPAPO_AVX2_AND(8, 2, 3); NFT_PIPAPO_AVX2_AND(9, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize); NFT_PIPAPO_AVX2_AND(11, 6, 7); NFT_PIPAPO_AVX2_AND(12, 8, 9); NFT_PIPAPO_AVX2_AND(13, 10, 11); /* Stall */ NFT_PIPAPO_AVX2_AND(1, 12, 13); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); NFT_PIPAPO_AVX2_AND(8, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize); NFT_PIPAPO_AVX2_AND(10, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize); NFT_PIPAPO_AVX2_AND(12, 6, 7); NFT_PIPAPO_AVX2_AND(13, 8, 9); NFT_PIPAPO_AVX2_AND(14, 10, 11); /* Stall */ NFT_PIPAPO_AVX2_AND(1, 12, 13); NFT_PIPAPO_AVX2_AND(1, 1, 14); } NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 1); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 48-bit fields (i.e. MAC addresses/EUI-48). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf, }; int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (!first) NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); if (!first) { NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); NFT_PIPAPO_AVX2_AND(1, 1, 0); } NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize); NFT_PIPAPO_AVX2_AND(9, 1, 4); NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize); NFT_PIPAPO_AVX2_AND(11, 5, 6); NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize); NFT_PIPAPO_AVX2_AND(13, 7, 8); NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize); NFT_PIPAPO_AVX2_AND(0, 9, 10); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize); NFT_PIPAPO_AVX2_AND(2, 11, 12); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize); NFT_PIPAPO_AVX2_AND(4, 13, 14); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_AND(6, 2, 3); /* Stalls */ NFT_PIPAPO_AVX2_AND(7, 4, 5); NFT_PIPAPO_AVX2_AND(8, 6, 7); NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 8); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 128-bit fields (i.e. IPv6 addresses). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf, pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf, pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf, pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf, pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf, pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf, }; int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (!first) NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); if (!first) { NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); NFT_PIPAPO_AVX2_AND(1, 1, 0); } NFT_PIPAPO_AVX2_AND(5, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); NFT_PIPAPO_AVX2_AND(8, 1, 4); NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize); NFT_PIPAPO_AVX2_AND(10, 5, 6); NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize); NFT_PIPAPO_AVX2_AND(12, 7, 8); NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize); NFT_PIPAPO_AVX2_AND(14, 9, 10); NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize); NFT_PIPAPO_AVX2_AND(1, 11, 12); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize); NFT_PIPAPO_AVX2_AND(4, 13, 14); NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize); NFT_PIPAPO_AVX2_AND(7, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize); NFT_PIPAPO_AVX2_AND(9, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize); NFT_PIPAPO_AVX2_AND(11, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize); NFT_PIPAPO_AVX2_AND(13, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize); NFT_PIPAPO_AVX2_AND(0, 8, 9); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize); NFT_PIPAPO_AVX2_AND(2, 10, 11); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize); NFT_PIPAPO_AVX2_AND(4, 12, 13); NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize); NFT_PIPAPO_AVX2_AND(6, 14, 0); NFT_PIPAPO_AVX2_AND(7, 1, 2); NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize); NFT_PIPAPO_AVX2_AND(9, 3, 4); NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize); NFT_PIPAPO_AVX2_AND(11, 5, 6); NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize); NFT_PIPAPO_AVX2_AND(13, 7, 8); NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize); NFT_PIPAPO_AVX2_AND(1, 9, 10); NFT_PIPAPO_AVX2_AND(2, 11, 12); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize); NFT_PIPAPO_AVX2_AND(4, 13, 14); NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize); NFT_PIPAPO_AVX2_AND(6, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize); NFT_PIPAPO_AVX2_AND(9, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize); NFT_PIPAPO_AVX2_AND(11, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize); NFT_PIPAPO_AVX2_AND(0, 6, 7); NFT_PIPAPO_AVX2_AND(1, 8, 9); NFT_PIPAPO_AVX2_AND(2, 10, 11); NFT_PIPAPO_AVX2_AND(3, 12, 0); /* Stalls */ NFT_PIPAPO_AVX2_AND(4, 1, 2); NFT_PIPAPO_AVX2_AND(5, 3, 4); NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 5); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 8-bit fields (i.e. protocol numbers). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); NFT_PIPAPO_AVX2_AND(2, 0, 1); NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); } NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 2); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 16-bit fields (i.e. ports). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_AND(4, 0, 1); } else { NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); /* Stall */ NFT_PIPAPO_AVX2_AND(3, 0, 1); NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); NFT_PIPAPO_AVX2_AND(4, 3, 2); } /* Stall */ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 32-bit fields (i.e. IPv4 addresses). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize); /* Stall */ NFT_PIPAPO_AVX2_AND(4, 0, 1); NFT_PIPAPO_AVX2_AND(5, 2, 3); NFT_PIPAPO_AVX2_AND(0, 4, 5); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); NFT_PIPAPO_AVX2_AND(6, 2, 3); /* Stall */ NFT_PIPAPO_AVX2_AND(7, 4, 5); NFT_PIPAPO_AVX2_AND(0, 6, 7); } NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 0); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 48-bit fields (i.e. MAC addresses/EUI-48). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize); NFT_PIPAPO_AVX2_AND(7, 2, 3); /* Stall */ NFT_PIPAPO_AVX2_AND(0, 4, 5); NFT_PIPAPO_AVX2_AND(1, 6, 7); NFT_PIPAPO_AVX2_AND(4, 0, 1); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); NFT_PIPAPO_AVX2_AND(6, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize); NFT_PIPAPO_AVX2_AND(0, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); /* Stall */ NFT_PIPAPO_AVX2_AND(3, 0, 1); NFT_PIPAPO_AVX2_AND(4, 2, 3); } NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 128-bit fields (i.e. IPv6 addresses). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (!first) NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); if (!first) { NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); NFT_PIPAPO_AVX2_AND(1, 1, 0); } NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize); NFT_PIPAPO_AVX2_AND(6, 1, 2); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize); NFT_PIPAPO_AVX2_AND(0, 3, 4); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize); NFT_PIPAPO_AVX2_AND(3, 5, 6); NFT_PIPAPO_AVX2_AND(4, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); NFT_PIPAPO_AVX2_AND(3, 4, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize); NFT_PIPAPO_AVX2_AND(0, 3, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize); NFT_PIPAPO_AVX2_AND(4, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize); NFT_PIPAPO_AVX2_AND(0, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize); NFT_PIPAPO_AVX2_AND(4, 0, 1); /* Stall */ NFT_PIPAPO_AVX2_AND(5, 2, 3); NFT_PIPAPO_AVX2_AND(6, 4, 5); NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 6); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes * @mdata: Matching data, including mapping table * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * This function should never be called, but is provided for the case the field * size doesn't match any of the known data types. Matching rate is * substantially lower than AVX2 routines. * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_slow(const struct nft_pipapo_match *mdata, unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { unsigned long bsize = f->bsize; int i, ret = -1, b; if (first) pipapo_resmap_init(mdata, map); for (i = offset; i < bsize; i++) { if (f->bb == 8) pipapo_and_field_buckets_8bit(f, map, pkt); else pipapo_and_field_buckets_4bit(f, map, pkt); NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last); if (last) return b; if (ret == -1) ret = b / XSAVE_YMM_SIZE; } return ret; } /** * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity * @desc: Set description, element count and field description used * @features: Flags: NFT_SET_INTERVAL needs to be there * @est: Storage for estimation data * * Return: true if set is compatible and AVX2 available, false otherwise. */ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { if (!(features & NFT_SET_INTERVAL) || desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX)) return false; est->size = pipapo_estimate_size(desc); if (!est->size) return false; est->lookup = NFT_SET_CLASS_O_LOG_N; est->space = NFT_SET_CLASS_O_N; return true; } /** * pipapo_resmap_init_avx2() - Initialise result map before first use * @m: Matching data, including mapping table * @res_map: Result map * * Like pipapo_resmap_init() but do not set start map bits covered by the first field. */ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map) { const struct nft_pipapo_field *f = m->f; int i; /* Starting map doesn't need to be set to all-ones for this implementation, * but we do need to zero the remaining bits, if any. */ for (i = f->bsize; i < m->bsize_max; i++) res_map[i] = 0ul; } /** * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation * @net: Network namespace * @set: nftables API set representation * @key: nftables API element representation containing key data * @ext: nftables API extension pointer, filled with matching reference * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. * * This implementation exploits the repetitive characteristic of the algorithm * to provide a fast, vectorised version using the AVX2 SIMD instruction set. * * Return: true on match, false otherwise. */ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_scratch *scratch; u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; unsigned long *res, *fill; bool map_index; int i, ret = 0; local_bh_disable(); if (unlikely(!irq_fpu_usable())) { bool fallback_res = nft_pipapo_lookup(net, set, key, ext); local_bh_enable(); return fallback_res; } m = rcu_dereference(priv->match); /* This also protects access to all data related to scratch maps. * * Note that we don't need a valid MXCSR state for any of the * operations we use here, so pass 0 as mask and spare a LDMXCSR * instruction. */ kernel_fpu_begin_mask(0); scratch = *raw_cpu_ptr(m->scratch); if (unlikely(!scratch)) { kernel_fpu_end(); local_bh_enable(); return false; } map_index = scratch->map_index; res = scratch->map + (map_index ? m->bsize_max : 0); fill = scratch->map + (map_index ? 0 : m->bsize_max); pipapo_resmap_init_avx2(m, res); nft_pipapo_avx2_prepare(); next_match: nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1, first = !i; #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \ ret, rp, \ first, last)) if (likely(f->bb == 8)) { if (f->groups == 1) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1); } else if (f->groups == 2) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2); } else if (f->groups == 4) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4); } else if (f->groups == 6) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6); } else if (f->groups == 16) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16); } else { ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, ret, rp, first, last); } } else { if (f->groups == 2) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2); } else if (f->groups == 4) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4); } else if (f->groups == 8) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8); } else if (f->groups == 12) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12); } else if (f->groups == 32) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32); } else { ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, ret, rp, first, last); } } NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; #undef NFT_SET_PIPAPO_AVX2_LOOKUP if (ret < 0) goto out; if (last) { *ext = &f->mt[ret].e->ext; if (unlikely(nft_set_elem_expired(*ext) || !nft_set_elem_active(*ext, genmask))) { ret = 0; goto next_match; } goto out; } swap(res, fill); rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } out: if (i % 2) scratch->map_index = !map_index; kernel_fpu_end(); local_bh_enable(); return ret >= 0; }
106 106 107 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // SPDX-License-Identifier: GPL-2.0-or-later /* mpihelp-mul_2.c - MPI helper functions * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc. * * This file is part of GnuPG. * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include "mpi-internal.h" #include "longlong.h" mpi_limb_t mpihelp_addmul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb) { mpi_limb_t cy_limb; mpi_size_t j; mpi_limb_t prod_high, prod_low; mpi_limb_t x; /* The loop counter and index J goes from -SIZE to -1. This way * the loop becomes faster. */ j = -s1_size; res_ptr -= j; s1_ptr -= j; cy_limb = 0; do { umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb); prod_low += cy_limb; cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high; x = res_ptr[j]; prod_low = x + prod_low; cy_limb += prod_low < x ? 1 : 0; res_ptr[j] = prod_low; } while (++j); return cy_limb; }
24 24 24 24 24 1294 69 24301 24303 14 59 2 57 11 2 9 223 158 14 56 69 114 11 30 24 65 136 197 134 1 1 1 46 1 46 2 1 1 42 136 125 46 123 88 6 5 14 115 27 29 24 104 20 29 2 111 11 124 22 22 6 20 4 1 18 17 18 16 222 1 221 140 91 222 200 193 30 107 24 88 58 55 107 200 141 152 199 5 107 27 27 19 2 20 20 16 20 21 3 2 5 41 1 39 2 17 22 21 3 9 8 2 36 40 23 17 17 23 120 120 9 120 119 120 24 24 23 14 1 13 28 26 13 24 5 24 13 24 22 116 115 15 15 102 3 211 208 96 69 135 202 2 6 4 192 88 28 88 116 1 4 4 88 115 167 12 175 69 106 123 11 16 94 292 181 4 116 25 5 114 2 2 5 2 4 3 245 6 220 4 215 15 1 1 13 10 10 10 8 8 8 13 221 221 34 5 26 34 34 7 4 37 37 1 35 1 36 36 5 1 28 7 44 44 53 53 38 7 35 9 54 54 31 34 252 229 36 251 27 4673 69 4674 4677 28 4668 4668 31 31 2 2 24 24 24 24 19 19 5 15277 114 15253 6 1 5 11 5 15 15 15 15 15 15 6 6 15 26 26 26 20 24482 1197 24223 15 1 19 19 1 19 34 24 15 187 14 1 24430 14 24415 2 97 98 24422 59 59 1185 5484 4434 4433 4431 14856 1167 22686 15098 15175 7 15114 530 7 4 7 12 12 4 12 12 12 12 12 12 12 12 7 12 2 5 4899 4912 7 12 6 4886 1694 1666 51 126 9 115 126 85 122 10 3 9 37 14 49 70 27 36 60 41 16 17 1115 1073 56 32 30 30 35 54 46 26 10 13 13 21 37 54 54 13 588 590 54 21 35 54 54 104 100 4 4 27 24 10 24 1 3 1 2 1 24 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 // SPDX-License-Identifier: GPL-2.0-only /* * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * * NUMA policy allows the user to give hints in which node(s) memory should * be allocated. * * Support six policies per VMA and per process: * * The VMA policy has priority over the process policy for a page fault. * * interleave Allocate memory interleaved over a set of nodes, * with normal fallback if it fails. * For VMA based allocations this interleaves based on the * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. * * weighted interleave * Allocate memory interleaved over a set of nodes based on * a set of weights (per-node), with normal fallback if it * fails. Otherwise operates the same as interleave. * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated * on node 0 for every 1 page allocated on node 1. * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. * * preferred many Try a set of nodes first before normal fallback. This is * similar to preferred without the special case. * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. * * The process policy is applied for most non interrupt memory allocations * in that process' context. Interrupts ignore the policies and always * try to allocate on the local CPU. The VMA policy is only applied for memory * allocations for a VMA in the VM. * * Currently there are a few corner cases in swapping where the policy * is not applied, but the majority should be handled. When process policy * is used it is not remembered over swap outs/swap ins. * * Only the highest zone in the zone hierarchy gets policied. Allocations * requesting a lower zone just use default policy. This implies that * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ /* Notebook: fix mmap readahead to honour policy and enable policy for any page cache object statistics for bigpages global policy for page cache? currently it uses process policy. Requires first item above. handle mremap for shared memory (currently ignored for the policy) grows down? make bind policy root only? It can trigger oom much faster and the kernel is not always grateful with that. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mempolicy.h> #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> #include <linux/gcd.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> #include <linux/memory.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /* * weightiness balances the tradeoff between small weights (cycles through nodes * faster, more fair/even distribution) and large weights (smaller errors * between actual bandwidth ratios and weight ratios). 32 is a number that has * been found to perform at a reasonable compromise between the two goals. */ static const int weightiness = 32; /* * A null weighted_interleave_state is interpreted as having .mode="auto", * and .iw_table is interpreted as an array of 1s with length nr_node_ids. */ struct weighted_interleave_state { bool mode_auto; u8 iw_table[]; }; static struct weighted_interleave_state __rcu *wi_state; static unsigned int *node_bw_table; /* * wi_state_lock protects both wi_state and node_bw_table. * node_bw_table is only used by writers to update wi_state. */ static DEFINE_MUTEX(wi_state_lock); static u8 get_il_weight(int node) { struct weighted_interleave_state *state; u8 weight = 1; rcu_read_lock(); state = rcu_dereference(wi_state); if (state) weight = state->iw_table[node]; rcu_read_unlock(); return weight; } /* * Convert bandwidth values into weighted interleave weights. * Call with wi_state_lock. */ static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw) { u64 sum_bw = 0; unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0; int nid; for_each_node_state(nid, N_MEMORY) sum_bw += bw[nid]; /* Scale bandwidths to whole numbers in the range [1, weightiness] */ for_each_node_state(nid, N_MEMORY) { /* * Try not to perform 64-bit division. * If sum_bw < scaling_factor, then sum_bw < U32_MAX. * If sum_bw > scaling_factor, then round the weight up to 1. */ scaling_factor = weightiness * bw[nid]; if (bw[nid] && sum_bw < scaling_factor) { cast_sum_bw = (unsigned int)sum_bw; new_iw[nid] = scaling_factor / cast_sum_bw; } else { new_iw[nid] = 1; } if (!iw_gcd) iw_gcd = new_iw[nid]; iw_gcd = gcd(iw_gcd, new_iw[nid]); } /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */ for_each_node_state(nid, N_MEMORY) new_iw[nid] /= iw_gcd; } int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords) { struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; unsigned int *old_bw, *new_bw; unsigned int bw_val; int i; bw_val = min(coords->read_bandwidth, coords->write_bandwidth); new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); if (!new_bw) return -ENOMEM; new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids), GFP_KERNEL); if (!new_wi_state) { kfree(new_bw); return -ENOMEM; } new_wi_state->mode_auto = true; for (i = 0; i < nr_node_ids; i++) new_wi_state->iw_table[i] = 1; /* * Update bandwidth info, even in manual mode. That way, when switching * to auto mode in the future, iw_table can be overwritten using * accurate bw data. */ mutex_lock(&wi_state_lock); old_bw = node_bw_table; if (old_bw) memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw)); new_bw[node] = bw_val; node_bw_table = new_bw; old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); if (old_wi_state && !old_wi_state->mode_auto) { /* Manual mode; skip reducing weights and updating wi_state */ mutex_unlock(&wi_state_lock); kfree(new_wi_state); goto out; } /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/ reduce_interleave_weights(new_bw, new_wi_state->iw_table); rcu_assign_pointer(wi_state, new_wi_state); mutex_unlock(&wi_state_lock); if (old_wi_state) { synchronize_rcu(); kfree(old_wi_state); } out: kfree(old_bw); return 0; } /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search * @state: State to filter the search * * Lookup the closest node by distance if @nid is not in state. * * Return: this @node if it is in state, otherwise the closest node by distance */ int numa_nearest_node(int node, unsigned int state) { int min_dist = INT_MAX, dist, n, min_node; if (state >= NR_NODE_STATES) return -EINVAL; if (node == NUMA_NO_NODE || node_state(node, state)) return node; min_node = node; for_each_node_state(n, state) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(numa_nearest_node); /** * nearest_node_nodemask - Find the node in @mask at the nearest distance * from @node. * * @node: a valid node ID to start the search from. * @mask: a pointer to a nodemask representing the allowed nodes. * * This function iterates over all nodes in @mask and calculates the * distance from the starting @node, then it returns the node ID that is * the closest to @node, or MAX_NUMNODES if no node is found. * * Note that @node must be a valid node ID usable with node_distance(), * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes * or unexpected behavior. */ int nearest_node_nodemask(int node, nodemask_t *mask) { int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; for_each_node_mask(n, *mask) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(nearest_node_nodemask); struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; if (pol) return pol; node = numa_node_id(); if (node != NUMA_NO_NODE) { pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } return &default_policy; } static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, const nodemask_t *rel) { nodemask_t tmp; nodes_fold(tmp, *orig, nodes_weight(*rel)); nodes_onto(*ret, tmp, *rel); } static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; nodes_clear(pol->nodes); node_set(first_node(*nodes), pol->nodes); return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; /* * Default (pol==NULL) resp. local memory policies are not a * subject of any remapping. They also do not need any special * constructor. */ if (!pol || pol->mode == MPOL_LOCAL) return 0; /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; } VM_BUG_ON(!nodes); /* * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). * All other modes require a valid pointer to a non-empty nodemask. */ if (mode == MPOL_PREFERRED) { if (nodes_empty(*nodes)) { if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; return policy; } /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *pol) { if (!atomic_dec_and_test(&pol->refcnt)) return; kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) tmp = *nodes; pol->nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { pol->w.cpuset_mems_allowed = *nodes; } /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; mpol_ops[pol->mode].rebind(pol, newmask); } /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. * * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); } /* * Rebind each vma in mm to new nodemask. * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); for_each_vma(vmi, vma) { vma_start_write(vma); mpol_rebind_policy(vma->vm_policy, new); } mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_DEFAULT] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, [MPOL_PREFERRED_MANY] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, [MPOL_WEIGHTED_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { /* * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO * if any misplaced page is found. */ return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == MPOL_MF_STRICT; } struct migration_mpol { /* for alloc_migration_target_by_mpol() */ struct mempolicy *pol; pgoff_t ilx; }; struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; unsigned long start; unsigned long end; struct vm_area_struct *first; struct folio *large; /* note last large folio encountered */ long nr_failed; /* could not be isolated at this time */ }; /* * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { struct folio *folio; struct queue_pages *qp = walk->private; if (unlikely(is_pmd_migration_entry(*pmd))) { qp->nr_failed++; return; } folio = pmd_folio(*pmd); if (is_huge_zero_folio(folio)) { walk->action = ACTION_CONTINUE; return; } if (!queue_folio_required(folio, qp)) return; if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma) || !migrate_folio_add(folio, qp->pagelist, qp->flags)) qp->nr_failed++; } /* * Scan through folios, checking if they satisfy the required conditions, * moving them from LRU to local pagelist for migration if they do (or not). * * queue_folios_pte_range() has two possible return values: * 0 - continue walking to scan for more, even if an existing folio on the * wrong node could not be isolated and queued for migration. * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, * and an existing folio was on a node that does not follow the policy. */ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; int max_nr, nr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { queue_folios_pmd(pmd, walk); spin_unlock(ptl); goto out; } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { max_nr = (end - addr) >> PAGE_SHIFT; nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; if (!pte_present(ptent)) { if (is_migration_entry(pte_to_swp_entry(ptent))) qp->nr_failed++; continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio) && max_nr != 1) nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, NULL, NULL, NULL); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. */ if (folio_test_reserved(folio)) continue; if (!queue_folio_required(folio, qp)) continue; if (folio_test_large(folio)) { /* * A large folio can only be isolated from LRU once, * but may be mapped by many PTEs (and Copy-On-Write may * intersperse PTEs of other, order 0, folios). This is * a common case, so don't mistake it for failure (but * there can be other cases of multi-mapped pages which * this quick check does not help to filter out - and a * search of the pagelist might grow to be prohibitive). * * migrate_pages(&pagelist) returns nr_failed folios, so * check "large" now so that queue_pages_range() returns * a comparable nr_failed folios. This does imply that * if folio could not be isolated for some racy reason * at its first PTE, later PTEs will not give it another * chance of isolation; but keeps the accounting simple. */ if (folio == qp->large) continue; qp->large = folio; } if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { qp->nr_failed += nr; if (strictly_unmovable(flags)) break; } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); out: if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(entry)) { if (unlikely(is_hugetlb_entry_migration(entry))) qp->nr_failed++; goto unlock; } folio = pfn_folio(pte_pfn(entry)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma)) { qp->nr_failed++; goto unlock; } /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_maybe_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) if (!folio_isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; #endif return 0; } #ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these * faults, pages may be migrated for better NUMA placement. * * This is assuming that NUMA faults are handled using PROT_NONE. If * an architecture makes a different choice, it will need further * changes to the core. */ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated > 0) { count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); } tlb_finish_mmu(&tlb); return nr_updated; } #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; /* range check first */ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); if (!qp->first) { qp->first = vma; if (!(flags & MPOL_MF_DISCONTIG_OK) && (qp->start < vma->vm_start)) /* hole at head side of range */ return -EFAULT; } next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable */ if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT)) return 1; /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. */ if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_WRLOCK, }; /* * Walk through page tables and collect pages to be migrated. * * If pages found in a given range are not on the required set of @nodes, * and migration is allowed, they are isolated and queued to @pagelist. * * queue_pages_range() may return: * 0 - all pages already on the right node, or successfully queued for moving * (or neither strict checking nor moving requested: only range checking). * >0 - this number of misplaced folios could not be queued for moving * (a hugetlbfs page or a transparent huge page being counted as 1). * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, .start = start, .end = end, .first = NULL, }; const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; return err ? : qp.nr_failed; } /* * Apply policy to a single VMA * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; vma_assert_write_locked(vma); new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) goto err_out; } old = vma->vm_policy; vma->vm_policy = new; /* protected by mmap_lock */ mpol_put(old); return 0; err_out: mpol_put(new); return err; } /* Split or merge the VMA (if required) and apply the new policy */ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { unsigned long vmstart, vmend; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { *prev = vma; vmstart = start; } else { vmstart = vma->vm_start; } if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) { ret = PTR_ERR(new); goto out; } task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); mpol_put(new); goto out; } old = current->mempolicy; current->mempolicy = new; if (new && (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; current->il_weight = 0; } task_unlock(current); mpol_put(old); ret = 0; out: NODEMASK_SCRATCH_FREE(scratch); return ret; } /* * Return nodemask for policy for get_mempolicy() query * * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); if (pol == &default_policy) return; switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ break; default: BUG(); } } static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; int ret; ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); if (ret > 0) { ret = page_to_nid(p); put_page(p); } return ret; } /* Retrieve NUMA policy */ static long do_get_mempolicy(int *policy, nodemask_t *nmask, unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; if (flags & MPOL_F_MEMS_ALLOWED) { if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ task_lock(current); *nmask = cpuset_current_mems_allowed; task_unlock(current); return 0; } if (flags & MPOL_F_ADDR) { pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. */ mmap_read_lock(mm); vma = vma_lookup(mm, addr); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; if (!pol) pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, because we are about to * drop the mmap_lock, after which only "pol" remains * valid, "vma" is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); } else if (pol == current->mempolicy && pol->mode == MPOL_WEIGHTED_INTERLEAVE) { if (current->il_weight) *policy = current->il_prev; else *policy = next_node_in(current->il_prev, pol->nodes); } else { err = -EINVAL; goto out; } } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing * the policy to userspace. */ *policy |= (pol->flags & MPOL_MODE_FLAGS); } err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { *nmask = pol->w.user_nodemask; } else { task_lock(current); get_policy_nodemask(pol, nmask); task_unlock(current); } } out: mpol_cond_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; } #ifdef CONFIG_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_maybe_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } else { /* * Non-movable folio may reach here. And, there may be * temporary off LRU folios or non-LRU movable folios. * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. */ return false; } } return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ static long migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); long nr_failed; long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, .reason = MR_SYSCALL, }; nodes_clear(nmask); node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); mmap_read_lock(mm); vma = find_vma(mm, 0); if (unlikely(!vma)) { mmap_read_unlock(mm); return 0; } /* * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, * but passes back the count of pages which could not be isolated. */ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } if (err >= 0) err += nr_failed; return err; } /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. * * Returns the number of page that could not be moved. */ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { long nr_failed = 0; long err = 0; nodemask_t tmp; lru_cache_disable(); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' * bit in 'tmp', and return that <source, dest> pair for migration. * The pair of nodemasks 'to' and 'from' define the map. * * If no pair of bits is found that way, fallback to picking some * pair of 'source' and 'dest' bits that are not the same. If the * 'source' and 'dest' bits are the same, this represents a node * that will be migrating to itself, so no pages need move. * * If no bits are left in 'tmp', or if all remaining bits left * in 'tmp' correspond to the same bit in 'to', return false * (nothing left to migrate). * * This lets us pick a pair of nodes to migrate between, such that * if possible the dest node is not already occupied by some other * source node, minimizing the risk of overloading the memory on a * node that would happen if we migrated incoming memory to a node * before migrating outgoing memory source that same node. * * A single scan of tmp is sufficient. As we go, we remember the * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. */ tmp = *from; while (!nodes_empty(tmp)) { int s, d; int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { /* * do_migrate_pages() tries to maintain the relative * node relationship of the pages established between * threads and memory areas. * * However if the number of source nodes is not equal to * the number of destination nodes we can not preserve * this node relative relationship. In that case, skip * copying memory from a node that is in the destination * mask. * * Example: [2,3,4] -> [3,4,5] moves everything. * [0-7] - > [3,4,5] moves only 0,1,2,6,7. */ if ((nodes_weight(*from) != nodes_weight(*to)) && (node_isset(s, *to))) continue; d = node_remap(s, *from, *to); if (s == d) continue; source = s; /* Node moved. Memorize */ dest = d; /* dest not in remaining from nodes? */ if (!node_isset(dest, tmp)) break; } if (source == NUMA_NO_NODE) break; node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) nr_failed += err; if (err < 0) break; } lru_cache_enable(); if (err < 0) return err; return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* * Allocate a new folio for page migration, according to NUMA mempolicy. */ static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { struct migration_mpol *mmpol = (struct migration_mpol *)private; struct mempolicy *pol = mmpol->pol; pgoff_t ilx = mmpol->ilx; unsigned int order; int nid = numa_node_id(); gfp_t gfp; order = folio_order(src); ilx += src->index >> order; if (folio_test_hugetlb(src)) { nodemask_t *nodemask; struct hstate *h; h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; else gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; return folio_alloc_mpol(gfp, order, pol, ilx, nid); } #else static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return -ENOSYS; } static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { return NULL; } #endif static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) return -EINVAL; if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all */ if (!new) flags |= MPOL_MF_DISCONTIG_OK; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); err = mpol_set_nodemask(new, nmask, scratch); if (err) mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } if (err) goto mpol_out; /* * Lock the VMAs before scanning for pages to migrate, * to ensure we don't miss a concurrently inserted page. */ nr_failed = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); if (nr_failed < 0) { err = nr_failed; nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { err = mbind_range(&vmi, vma, &prev, start, end, new); if (err) break; } } if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } mmpol.pol = new; mmpol.ilx = 0; /* * In the interleaved case, attempt to allocate on exactly the * targeted nodes, for the first VMA to be migrated; for later * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. */ if (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE) { struct folio *folio; unsigned int order; unsigned long addr = -EFAULT; list_for_each_entry(folio, &pagelist, lru) { if (!folio_test_ksm(folio)) break; } if (!list_entry_is_head(folio, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { addr = page_address_in_vma(folio, folio_page(folio, 0), vma); if (addr != -EFAULT) break; } } if (addr != -EFAULT) { order = folio_order(folio); /* We already know the pol, but not the ilx */ mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx)); /* Set base from which to increment by index */ mmpol.ilx -= folio->index >> order; } } } mmap_write_unlock(mm); if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_enable(); return err; } /* * User space interface with variable sized bitmaps for nodelists. */ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long nlongs = BITS_TO_LONGS(maxnode); int ret; if (in_compat_syscall()) ret = compat_get_bitmap(mask, (const compat_ulong_t __user *)nmask, maxnode); else ret = copy_from_user(mask, nmask, nlongs * sizeof(unsigned long)); if (ret) return -EFAULT; if (maxnode % BITS_PER_LONG) mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; return 0; } /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; /* * When the user specified more nodes than supported just check * if the non supported part is all zero, one word at a time, * starting at the end. */ while (maxnode > MAX_NUMNODES) { unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); unsigned long t; if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) return -EFAULT; if (maxnode - bits >= MAX_NUMNODES) { maxnode -= bits; } else { maxnode = MAX_NUMNODES; t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); } if (t) return -EINVAL; } return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); bool compat = in_compat_syscall(); if (compat) nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) return -EINVAL; if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; maxnode = nr_node_ids; } if (compat) return compat_put_bitmap((compat_ulong_t __user *)mask, nodes_addr(*nodes), maxnode); return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) *flags |= (MPOL_F_MOF | MPOL_F_MORON); else return -EINVAL; } return 0; } static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; start = untagged_addr(start); err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) return -EINVAL; /* * flags is used for future extension if any. */ if (flags != 0) return -EINVAL; /* * Check home_node is online to avoid accessing uninitialized * NODE_DATA. */ if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; mmap_write_lock(mm); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. */ old = vma_policy(vma); if (!old) { prev = vma; continue; } if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } new = mpol_dup(old); if (IS_ERR(new)) { err = PTR_ERR(new); break; } vma_start_write(vma); new->home_node = home_node; err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; } mmap_write_unlock(mm); return err; } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { return kernel_mbind(start, len, mode, nmask, maxnode, flags); } /* Set the process memory policy */ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_set_mempolicy(lmode, mode_flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { return kernel_set_mempolicy(mode, nmask, maxnode); } static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; int err; nodemask_t *old; nodemask_t *new; NODEMASK_SCRATCH(scratch); if (!scratch) return -ENOMEM; old = &scratch->mask1; new = &scratch->mask2; err = get_nodes(old, old_nodes, maxnode); if (err) goto out; err = get_nodes(new, new_nodes, maxnode); if (err) goto out; /* Find the mm_struct */ rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { rcu_read_unlock(); err = -ESRCH; goto out; } get_task_struct(task); err = -EINVAL; /* * Check if this process has the right to modify the specified process. * Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; } rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } task_nodes = cpuset_mems_allowed(current); nodes_and(*new, *new, task_nodes); if (nodes_empty(*new)) goto out_put; err = security_task_movememory(task); if (err) goto out_put; mm = get_task_mm(task); put_task_struct(task); if (!mm) { err = -EINVAL; goto out; } err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: NODEMASK_SCRATCH_FREE(scratch); return err; out_put: put_task_struct(task); goto out; } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags) { int err; int pval; nodemask_t nodes; if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; addr = untagged_addr(addr); err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) return err; if (policy && put_user(pval, policy)) return -EFAULT; if (nmask) err = copy_nodes_to_user(nmask, maxnode, &nodes); return err; } SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; /* * DAX device mappings require predictable access latency, so avoid * incurring periodic faults. */ if (vma_is_dax(vma)) return false; if (is_vm_hugetlb_page(vma) && !hugepage_migration_supported(hstate_vma(vma))) return false; /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not * possible. */ if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) return false; return true; } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { *ilx = 0; return (vma->vm_ops && vma->vm_ops->get_policy) ? vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { struct mempolicy *pol; pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } return pol; } bool vma_policy_mof(struct vm_area_struct *vma) { struct mempolicy *pol; if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; pgoff_t ilx; /* ignored here */ pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); return ret; } pol = vma->vm_policy; if (!pol) pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* * if policy->nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->nodes is intersect with node_states[N_MEMORY]. * so if the following test fails, it implies * policy->nodes has movable memory only. */ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } static unsigned int weighted_interleave_nodes(struct mempolicy *policy) { unsigned int node; unsigned int cpuset_mems_cookie; retry: /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ cpuset_mems_cookie = read_mems_allowed_begin(); node = current->il_prev; if (!current->il_weight || !node_isset(node, policy->nodes)) { node = next_node_in(node, policy->nodes); if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry; if (node == MAX_NUMNODES) return node; current->il_prev = node; current->il_weight = get_il_weight(node); } current->il_weight--; return node; } /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; unsigned int cpuset_mems_cookie; /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ do { cpuset_mems_cookie = read_mems_allowed_begin(); nid = next_node_in(current->il_prev, policy->nodes); } while (read_mems_allowed_retry(cpuset_mems_cookie)); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. */ unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; int node = numa_mem_id(); if (!in_task()) return node; policy = current->mempolicy; if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: return first_node(policy->nodes); case MPOL_INTERLEAVE: return interleave_nodes(policy); case MPOL_WEIGHTED_INTERLEAVE: return weighted_interleave_nodes(policy); case MPOL_BIND: case MPOL_PREFERRED_MANY: { struct zoneref *z; /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); return zonelist_zone(z) ? zonelist_node_idx(z) : node; } case MPOL_LOCAL: return node; default: BUG(); } } static unsigned int read_once_policy_nodemask(struct mempolicy *pol, nodemask_t *mask) { /* * barrier stabilizes the nodemask locally so that it can be iterated * over safely without concern for changes. Allocators validate node * selection does not violate mems_allowed, so this is safe. */ barrier(); memcpy(mask, &pol->nodes, sizeof(nodemask_t)); barrier(); return nodes_weight(*mask); } static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) { struct weighted_interleave_state *state; nodemask_t nodemask; unsigned int target, nr_nodes; u8 *table = NULL; unsigned int weight_total = 0; u8 weight; int nid = 0; nr_nodes = read_once_policy_nodemask(pol, &nodemask); if (!nr_nodes) return numa_node_id(); rcu_read_lock(); state = rcu_dereference(wi_state); /* Uninitialized wi_state means we should assume all weights are 1 */ if (state) table = state->iw_table; /* calculate the total weight */ for_each_node_mask(nid, nodemask) weight_total += table ? table[nid] : 1; /* Calculate the node offset based on totals */ target = ilx % weight_total; nid = first_node(nodemask); while (target) { /* detect system default usage */ weight = table ? table[nid] : 1; if (target < weight) break; target -= weight; nid = next_node_in(nid, nodemask); } rcu_read_unlock(); return nid; } /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx * exceeds the number of present nodes. */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; nnodes = read_once_policy_nodemask(pol, &nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation, together with preferred node id (or the input node id). */ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid) { nodemask_t *nodemask = NULL; switch (pol->mode) { case MPOL_PREFERRED: /* Override input node id */ *nid = first_node(pol->nodes); break; case MPOL_PREFERRED_MANY: nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; break; case MPOL_BIND: /* Restrict to nodemask (but not on lower zones) */ if (apply_policy_zone(pol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&pol->nodes)) nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the * requested node and not break the policy. */ WARN_ON_ONCE(gfp & __GFP_THISNODE); break; case MPOL_INTERLEAVE: /* Override input node id */ *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; case MPOL_WEIGHTED_INTERLEAVE: *nid = (ilx == NO_INTERLEAVE_INDEX) ? weighted_interleave_nodes(pol) : weighted_interleave_nid(pol, ilx); break; } return nodemask; } #ifdef CONFIG_HUGETLBFS /* * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { pgoff_t ilx; int nid; nid = numa_node_id(); *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } /* * init_nodemask_of_mempolicy * * If the current task's mempolicy is "default" [NULL], return 'false' * to indicate default policy. Otherwise, extract the policy nodemask * for 'bind' or 'interleave' policy into the argument nodemask, or * initialize the argument nodemask to contain the single node for * 'preferred' or 'local' policy and return 'true' to indicate presence * of non-default mempolicy. * * We don't bother with reference counting the mempolicy [mpol_get/put] * because the current task is examining it's own mempolicy and a task's * mempolicy is only ever changed by the task itself. * * N.B., it is the caller's responsibility to free a returned nodemask. */ bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; if (!(mask && current->mempolicy)) return false; task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; case MPOL_LOCAL: init_nodemask_of_node(mask, numa_node_id()); break; default: BUG(); } task_unlock(current); return true; } #endif /* * mempolicy_in_oom_domain * * If tsk's mempolicy is "bind", check for intersection between mask and * the policy nodemask. Otherwise, return true for all other policies * including "interleave", as a tsk with "interleave" policy may have * memory allocated from all nodes in system. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; bool ret = true; if (!mask) return ret; task_lock(tsk); mempolicy = tsk->mempolicy; if (mempolicy && mempolicy->mode == MPOL_BIND) ret = nodes_intersects(mempolicy->nodes, *mask); task_unlock(tsk); return ret; } static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; /* * This is a two pass approach. The first pass will only try the * preferred nodes but skip the direct reclaim and allow the * allocation to fail, while the second pass will try all the * nodes in system. */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); return page; } /** * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. * @order: Order of the page allocation. * @pol: Pointer to the NUMA mempolicy. * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * * Return: The page on success or NULL if allocation fails. */ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; struct page *page; nodemask = policy_nodemask(gfp, pol, ilx, &nid); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_preferred_many(gfp, order, nid, nodemask); if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred * node) we only try to allocate from the current/preferred * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && pol->mode != MPOL_WEIGHTED_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ page = __alloc_frozen_pages_noprof( gfp | __GFP_THISNODE | __GFP_NORETRY, order, nid, NULL); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ } } page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ if (static_branch_likely(&vm_numa_stat_key) && page_to_nid(page) == nid) { preempt_disable(); __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } } return page; } struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid); if (!page) return NULL; set_page_refcounted(page); return page_rmappable_folio(page); } /** * vma_alloc_folio - Allocate a folio for a VMA. * @gfp: GFP flags. * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and * excepting where direct use of folio_alloc_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct folio *folio; if (vma->vm_flags & VM_DROPPABLE) gfp |= __GFP_NOWARN; pol = get_vma_policy(vma, addr, order, &ilx); folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); return folio; } EXPORT_SYMBOL(vma_alloc_folio_noprof); struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, numa_node_id()); } /** * alloc_pages - Allocate pages. * @gfp: GFP flags. * @order: Power of two of number of pages to allocate. * * Allocate 1 << @order contiguous pages. The physical address of the * first page is naturally aligned (eg an order-3 allocation will be aligned * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current * process is honoured when in process context. * * Context: Can be called from any context, providing the appropriate GFP * flags are used. * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { struct page *page = alloc_frozen_pages_noprof(gfp, order); if (page) set_page_refcounted(page); return page; } EXPORT_SYMBOL(alloc_pages_noprof); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc_noprof); static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { int nodes; unsigned long nr_pages_per_node; int delta; int i; unsigned long nr_allocated; unsigned long total_allocated = 0; nodes = nodes_weight(pol->nodes); nr_pages_per_node = nr_pages / nodes; delta = nr_pages - nodes * nr_pages_per_node; for (i = 0; i < nodes; i++) { if (delta) { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, page_array); delta--; } else { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, page_array); } page_array += nr_allocated; total_allocated += nr_allocated; } return total_allocated; } static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { struct weighted_interleave_state *state; struct task_struct *me = current; unsigned int cpuset_mems_cookie; unsigned long total_allocated = 0; unsigned long nr_allocated = 0; unsigned long rounds; unsigned long node_pages, delta; u8 *weights, weight; unsigned int weight_total = 0; unsigned long rem_pages = nr_pages; nodemask_t nodes; int nnodes, node; int resume_node = MAX_NUMNODES - 1; u8 resume_weight = 0; int prev_node; int i; if (!nr_pages) return 0; /* read the nodes onto the stack, retry if done during rebind */ do { cpuset_mems_cookie = read_mems_allowed_begin(); nnodes = read_once_policy_nodemask(pol, &nodes); } while (read_mems_allowed_retry(cpuset_mems_cookie)); /* if the nodemask has become invalid, we cannot do anything */ if (!nnodes) return 0; /* Continue allocating from most recent node and adjust the nr_pages */ node = me->il_prev; weight = me->il_weight; if (weight && node_isset(node, nodes)) { node_pages = min(rem_pages, weight); nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, page_array); page_array += nr_allocated; total_allocated += nr_allocated; /* if that's all the pages, no need to interleave */ if (rem_pages <= weight) { me->il_weight -= rem_pages; return total_allocated; } /* Otherwise we adjust remaining pages, continue from there */ rem_pages -= weight; } /* clear active weight in case of an allocation failure */ me->il_weight = 0; prev_node = node; /* create a local copy of node weights to operate on outside rcu */ weights = kzalloc(nr_node_ids, GFP_KERNEL); if (!weights) return total_allocated; rcu_read_lock(); state = rcu_dereference(wi_state); if (state) { memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8)); rcu_read_unlock(); } else { rcu_read_unlock(); for (i = 0; i < nr_node_ids; i++) weights[i] = 1; } /* calculate total, detect system default usage */ for_each_node_mask(node, nodes) weight_total += weights[node]; /* * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. * Track which node weighted interleave should resume from. * * if (rounds > 0) and (delta == 0), resume_node will always be * the node following prev_node and its weight. */ rounds = rem_pages / weight_total; delta = rem_pages % weight_total; resume_node = next_node_in(prev_node, nodes); resume_weight = weights[resume_node]; for (i = 0; i < nnodes; i++) { node = next_node_in(prev_node, nodes); weight = weights[node]; node_pages = weight * rounds; /* If a delta exists, add this node's portion of the delta */ if (delta > weight) { node_pages += weight; delta -= weight; } else if (delta) { /* when delta is depleted, resume from that node */ node_pages += delta; resume_node = node; resume_weight = weight - delta; delta = 0; } /* node_pages can be 0 if an allocation fails and rounds == 0 */ if (!node_pages) break; nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, page_array); page_array += nr_allocated; total_allocated += nr_allocated; if (total_allocated == nr_pages) break; prev_node = node; } me->il_prev = resume_node; me->il_weight = resume_weight; kfree(weights); return total_allocated; } static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { gfp_t preferred_gfp; unsigned long nr_allocated = 0; preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, nr_pages, page_array); if (nr_allocated < nr_pages) nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, page_array + nr_allocated); return nr_allocated; } /* alloc pages bulk and mempolicy should be considered at the * same time in some situation such as vmalloc. * * It can accelerate memory allocation especially interleaving * allocate memory. */ unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; nodemask_t *nodemask; int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) return alloc_pages_bulk_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) return alloc_pages_bulk_weighted_interleave( gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return alloc_pages_bulk_noprof(gfp, nid, nodemask, nr_pages, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); dst->vm_policy = pol; return 0; } /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). * * current's mempolicy may be rebinded by the other task(the task that changes * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); *new = *old; task_unlock(current); } else *new = *old; if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; } /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; if (a->home_node != b->home_node) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; default: BUG(); return false; } } /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ /* * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ static struct sp_node *sp_lookup(struct shared_policy *sp, pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); if (start >= p->end) n = n->rb_right; else if (end <= p->start) n = n->rb_left; else break; } if (!n) return NULL; for (;;) { struct sp_node *w = NULL; struct rb_node *prev = rb_prev(n); if (!prev) break; w = rb_entry(prev, struct sp_node, nd); if (w->end <= start) break; n = prev; } return rb_entry(n, struct sp_node, nd); } /* * Insert a new shared policy into the list. Caller holds sp->lock for * writing. */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; struct rb_node *parent = NULL; struct sp_node *nd; while (*p) { parent = *p; nd = rb_entry(parent, struct sp_node, nd); if (new->start < nd->start) p = &(*p)->rb_left; else if (new->end > nd->end) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); } /* Find shared policy intersecting idx */ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } read_unlock(&sp->lock); return pol; } static void sp_free(struct sp_node *n) { mpol_put(n->policy); kmem_cache_free(sn_cache, n); } /** * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked * @vmf: structure describing the fault * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. */ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); struct vm_area_struct *vma = vmf->vma; int thiscpu = raw_smp_processor_id(); int thisnid = numa_node_id(); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; /* * Make sure ptl is held so that we don't preempt and we * have a stable smp processor id */ lockdep_assert_held(vmf->ptl); pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: polnid = interleave_nid(pol, ilx); break; case MPOL_WEIGHTED_INTERLEAVE: polnid = weighted_interleave_nid(pol, ilx); break; case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; polnid = first_node(pol->nodes); break; case MPOL_LOCAL: polnid = numa_node_id(); break; case MPOL_BIND: case MPOL_PREFERRED_MANY: /* * Even though MPOL_PREFERRED_MANY can allocate pages outside * policy nodemask we don't allow numa migration to nodes * outside policy nodemask for now. This is done so that if we * want demotion to slow memory to happen, before allocating * from some DRAM node say 'x', we will end up using a * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario * we should not promote to node 'x' from slow memory node. */ if (pol->flags & MPOL_F_MORON) { /* * Optimize placement among multiple nodes * via NUMA balancing */ if (node_isset(thisnid, pol->nodes)) break; goto out; } /* * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zonelist_node_idx(z); break; default: BUG(); } /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } if (curnid != polnid) ret = polnid; out: mpol_cond_put(pol); return ret; } /* * Drop the (possibly final) reference to task->mempolicy. It needs to be * dropped after task->mempolicy is set to NULL so that any allocation done as * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed * policy. */ void mpol_put_task_policy(struct task_struct *task) { struct mempolicy *pol; task_lock(task); pol = task->mempolicy; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) { rb_erase(&n->nd, &sp->root); sp_free(n); } static void sp_node_init(struct sp_node *node, unsigned long start, unsigned long end, struct mempolicy *pol) { node->start = start; node->end = end; node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { struct sp_node *n; struct mempolicy *newpol; n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; newpol = mpol_dup(pol); if (IS_ERR(newpol)) { kmem_cache_free(sn_cache, n); return NULL; } newpol->flags |= MPOL_F_SHARED; sp_node_init(n, start, end, newpol); return n; } /* Replace a policy range. */ static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; struct mempolicy *mpol_new = NULL; int ret = 0; restart: write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { struct rb_node *next = rb_next(&n->nd); if (n->start >= start) { if (n->end <= end) sp_delete(sp, n); else n->start = end; } else { /* Old policy spanning whole new range. */ if (n->end > end) { if (!n_new) goto alloc_new; *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; } else n->end = start; } if (!next) break; n = rb_entry(next, struct sp_node, nd); } if (new) sp_insert(sp, new); write_unlock(&sp->lock); ret = 0; err_out: if (mpol_new) mpol_put(mpol_new); if (n_new) kmem_cache_free(sn_cache, n_new); return ret; alloc_new: write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) goto err_out; mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; atomic_set(&mpol_new->refcnt, 1); goto restart; } /** * mpol_shared_policy_init - initialize shared policy for inode * @sp: pointer to inode shared policy * @mpol: struct mempolicy to install * * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ rwlock_init(&sp->lock); if (mpol) { struct sp_node *sn; struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; /* contextualize the tmpfs mount point mempolicy to this file */ npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) goto put_npol; /* alloc node covering entire file; adds ref to file's npol */ sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); if (sn) sp_insert(sp, sn); put_npol: mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); if (pol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; if (!sp->root.rb_node) return; write_lock(&sp->lock); next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(sp, n); } write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { bool numabalancing_default = false; if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } static int __init setup_numabalancing(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "enable")) { numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { numabalancing_override = -1; ret = 1; } out: if (!ret) pr_warn("Unable to parse numa_balancing=\n"); return ret; } __setup("numa_balancing=", setup_numabalancing); #else static inline void __init check_numabalancing_enable(void) { } #endif /* CONFIG_NUMA_BALANCING */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; unsigned long largest = 0; int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), }; } /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ if (largest < total_pages) { largest = total_pages; prefer = nid; } /* Interleave this node? */ if ((total_pages << PAGE_SHIFT) >= (16 << 20)) node_set(nid, interleave_nodes); } /* All too small, use the largest */ if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } /* Reset policy of current process to default */ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* * Parse and format mempolicy from/to strings */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. * * Format of input: * <mode>[=<flags>][:<nodelist>] * * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int err = 1, mode; if (flags) *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* * Insist on a nodelist of one node only, although later * we use first_node(nodes) to grab a single node, so here * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) rest++; if (*rest) goto out; if (nodes_empty(nodes)) goto out; } break; case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* * Don't allow a nodelist; mpol_new() checks flags */ if (nodelist) goto out; break; case MPOL_DEFAULT: /* * Insist on a empty nodelist */ if (!nodelist) err = 0; goto out; case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist */ if (!nodelist) goto out; } mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. */ if (!strcmp(flags, "static")) mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) goto out; /* * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ if (mode != MPOL_PREFERRED) { new->nodes = nodes; } else if (nodelist) { nodes_clear(new->nodes); node_set(first_node(nodes), new->nodes); } else { new->mode = MPOL_LOCAL; } /* * Save nodes for contextualization: this will be used to "clone" * the mempolicy in a specific context [cpuset] at a later time. */ new->w.user_nodemask = nodes; err = 0; out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; if (!err) *mpol = new; return err; } #endif /* CONFIG_TMPFS */ /** * mpol_to_str - format a mempolicy structure for printing * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. * Recommend a @maxlen of at least 51 for the longest mode, "weighted * interleave", plus the longest flag flags, "relative|balancing", and to * display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; nodemask_t nodes = NODE_MASK_NONE; unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; if (pol && pol != &default_policy && !(pol >= &preferred_node_policy[0] && pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: case MPOL_LOCAL: break; case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: WARN_ON_ONCE(1); snprintf(p, maxlen, "unknown"); return; } p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); /* * Static and relative are mutually exclusive. */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); if (flags & MPOL_F_NUMA_BALANCING) { if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) p += snprintf(p, buffer + maxlen - p, "|"); p += snprintf(p, buffer + maxlen - p, "balancing"); } } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } #ifdef CONFIG_SYSFS struct iw_node_attr { struct kobj_attribute kobj_attr; int nid; }; struct sysfs_wi_group { struct kobject wi_kobj; struct mutex kobj_lock; struct iw_node_attr *nattrs[]; }; static struct sysfs_wi_group *wi_group; static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct iw_node_attr *node_attr; u8 weight; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); weight = get_il_weight(node_attr->nid); return sysfs_emit(buf, "%d\n", weight); } static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; struct iw_node_attr *node_attr; u8 weight = 0; int i; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); if (count == 0 || sysfs_streq(buf, "") || kstrtou8(buf, 0, &weight) || weight == 0) return -EINVAL; new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), GFP_KERNEL); if (!new_wi_state) return -ENOMEM; mutex_lock(&wi_state_lock); old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); if (old_wi_state) { memcpy(new_wi_state->iw_table, old_wi_state->iw_table, nr_node_ids * sizeof(u8)); } else { for (i = 0; i < nr_node_ids; i++) new_wi_state->iw_table[i] = 1; } new_wi_state->iw_table[node_attr->nid] = weight; new_wi_state->mode_auto = false; rcu_assign_pointer(wi_state, new_wi_state); mutex_unlock(&wi_state_lock); if (old_wi_state) { synchronize_rcu(); kfree(old_wi_state); } return count; } static ssize_t weighted_interleave_auto_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct weighted_interleave_state *state; bool wi_auto = true; rcu_read_lock(); state = rcu_dereference(wi_state); if (state) wi_auto = state->mode_auto; rcu_read_unlock(); return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); } static ssize_t weighted_interleave_auto_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; unsigned int *bw; bool input; int i; if (kstrtobool(buf, &input)) return -EINVAL; new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), GFP_KERNEL); if (!new_wi_state) return -ENOMEM; for (i = 0; i < nr_node_ids; i++) new_wi_state->iw_table[i] = 1; mutex_lock(&wi_state_lock); if (!input) { old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); if (!old_wi_state) goto update_wi_state; if (input == old_wi_state->mode_auto) { mutex_unlock(&wi_state_lock); return count; } memcpy(new_wi_state->iw_table, old_wi_state->iw_table, nr_node_ids * sizeof(u8)); goto update_wi_state; } bw = node_bw_table; if (!bw) { mutex_unlock(&wi_state_lock); kfree(new_wi_state); return -ENODEV; } new_wi_state->mode_auto = true; reduce_interleave_weights(bw, new_wi_state->iw_table); update_wi_state: rcu_assign_pointer(wi_state, new_wi_state); mutex_unlock(&wi_state_lock); if (old_wi_state) { synchronize_rcu(); kfree(old_wi_state); } return count; } static void sysfs_wi_node_delete(int nid) { struct iw_node_attr *attr; if (nid < 0 || nid >= nr_node_ids) return; mutex_lock(&wi_group->kobj_lock); attr = wi_group->nattrs[nid]; if (!attr) { mutex_unlock(&wi_group->kobj_lock); return; } wi_group->nattrs[nid] = NULL; mutex_unlock(&wi_group->kobj_lock); sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); kfree(attr->kobj_attr.attr.name); kfree(attr); } static void sysfs_wi_node_delete_all(void) { int nid; for (nid = 0; nid < nr_node_ids; nid++) sysfs_wi_node_delete(nid); } static void wi_state_free(void) { struct weighted_interleave_state *old_wi_state; mutex_lock(&wi_state_lock); old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); if (!old_wi_state) { mutex_unlock(&wi_state_lock); return; } rcu_assign_pointer(wi_state, NULL); mutex_unlock(&wi_state_lock); synchronize_rcu(); kfree(old_wi_state); } static struct kobj_attribute wi_auto_attr = __ATTR(auto, 0664, weighted_interleave_auto_show, weighted_interleave_auto_store); static void wi_cleanup(void) { sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); sysfs_wi_node_delete_all(); wi_state_free(); } static void wi_kobj_release(struct kobject *wi_kobj) { kfree(wi_group); } static const struct kobj_type wi_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = wi_kobj_release, }; static int sysfs_wi_node_add(int nid) { int ret; char *name; struct iw_node_attr *new_attr; if (nid < 0 || nid >= nr_node_ids) { pr_err("invalid node id: %d\n", nid); return -EINVAL; } new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL); if (!new_attr) return -ENOMEM; name = kasprintf(GFP_KERNEL, "node%d", nid); if (!name) { kfree(new_attr); return -ENOMEM; } sysfs_attr_init(&new_attr->kobj_attr.attr); new_attr->kobj_attr.attr.name = name; new_attr->kobj_attr.attr.mode = 0644; new_attr->kobj_attr.show = node_show; new_attr->kobj_attr.store = node_store; new_attr->nid = nid; mutex_lock(&wi_group->kobj_lock); if (wi_group->nattrs[nid]) { mutex_unlock(&wi_group->kobj_lock); ret = -EEXIST; goto out; } ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); if (ret) { mutex_unlock(&wi_group->kobj_lock); goto out; } wi_group->nattrs[nid] = new_attr; mutex_unlock(&wi_group->kobj_lock); return 0; out: kfree(new_attr->kobj_attr.attr.name); kfree(new_attr); return ret; } static int wi_node_notifier(struct notifier_block *nb, unsigned long action, void *data) { int err; struct memory_notify *arg = data; int nid = arg->status_change_nid; if (nid < 0) return NOTIFY_OK; switch (action) { case MEM_ONLINE: err = sysfs_wi_node_add(nid); if (err) pr_err("failed to add sysfs for node%d during hotplug: %d\n", nid, err); break; case MEM_OFFLINE: sysfs_wi_node_delete(nid); break; } return NOTIFY_OK; } static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) { int nid, err; wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids), GFP_KERNEL); if (!wi_group) return -ENOMEM; mutex_init(&wi_group->kobj_lock); err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, "weighted_interleave"); if (err) goto err_put_kobj; err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); if (err) goto err_put_kobj; for_each_online_node(nid) { if (!node_state(nid, N_MEMORY)) continue; err = sysfs_wi_node_add(nid); if (err) { pr_err("failed to add sysfs for node%d during init: %d\n", nid, err); goto err_cleanup_kobj; } } hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); return 0; err_cleanup_kobj: wi_cleanup(); kobject_del(&wi_group->wi_kobj); err_put_kobj: kobject_put(&wi_group->wi_kobj); return err; } static int __init mempolicy_sysfs_init(void) { int err; static struct kobject *mempolicy_kobj; mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); if (!mempolicy_kobj) return -ENOMEM; err = add_weighted_interleave_group(mempolicy_kobj); if (err) goto err_kobj; return 0; err_kobj: kobject_del(mempolicy_kobj); kobject_put(mempolicy_kobj); return err; } late_initcall(mempolicy_sysfs_init); #endif /* CONFIG_SYSFS */
1 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 // SPDX-License-Identifier: GPL-2.0-only /* * Driver for RobotFuzz OSIF * * Copyright (c) 2013 Andrew Lunn <andrew@lunn.ch> * Copyright (c) 2007 Barry Carter <Barry.Carter@robotfuzz.com> * * Based on the i2c-tiny-usb by * * Copyright (C) 2006 Til Harbaum (Till@Harbaum.org) */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/i2c.h> #include <linux/slab.h> #include <linux/usb.h> #define OSIFI2C_READ 20 #define OSIFI2C_WRITE 21 #define OSIFI2C_STOP 22 #define OSIFI2C_STATUS 23 #define OSIFI2C_SET_BIT_RATE 24 #define STATUS_ADDRESS_ACK 0 #define STATUS_ADDRESS_NAK 2 struct osif_priv { struct usb_device *usb_dev; struct usb_interface *interface; struct i2c_adapter adapter; unsigned char status; }; static int osif_usb_read(struct i2c_adapter *adapter, int cmd, int value, int index, void *data, int len) { struct osif_priv *priv = adapter->algo_data; return usb_control_msg(priv->usb_dev, usb_rcvctrlpipe(priv->usb_dev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_IN, value, index, data, len, 2000); } static int osif_usb_write(struct i2c_adapter *adapter, int cmd, int value, int index, void *data, int len) { struct osif_priv *priv = adapter->algo_data; return usb_control_msg(priv->usb_dev, usb_sndctrlpipe(priv->usb_dev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_INTERFACE, value, index, data, len, 2000); } static int osif_xfer(struct i2c_adapter *adapter, struct i2c_msg *msgs, int num) { struct osif_priv *priv = adapter->algo_data; struct i2c_msg *pmsg; int ret; int i; for (i = 0; i < num; i++) { pmsg = &msgs[i]; if (pmsg->flags & I2C_M_RD) { ret = osif_usb_read(adapter, OSIFI2C_READ, pmsg->flags, pmsg->addr, pmsg->buf, pmsg->len); if (ret != pmsg->len) { dev_err(&adapter->dev, "failure reading data\n"); return -EREMOTEIO; } } else { ret = osif_usb_write(adapter, OSIFI2C_WRITE, pmsg->flags, pmsg->addr, pmsg->buf, pmsg->len); if (ret != pmsg->len) { dev_err(&adapter->dev, "failure writing data\n"); return -EREMOTEIO; } } ret = osif_usb_write(adapter, OSIFI2C_STOP, 0, 0, NULL, 0); if (ret) { dev_err(&adapter->dev, "failure sending STOP\n"); return -EREMOTEIO; } /* read status */ ret = osif_usb_read(adapter, OSIFI2C_STATUS, 0, 0, &priv->status, 1); if (ret != 1) { dev_err(&adapter->dev, "failure reading status\n"); return -EREMOTEIO; } if (priv->status != STATUS_ADDRESS_ACK) { dev_dbg(&adapter->dev, "status = %d\n", priv->status); return -EREMOTEIO; } } return i; } static u32 osif_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; } static const struct i2c_algorithm osif_algorithm = { .xfer = osif_xfer, .functionality = osif_func, }; #define USB_OSIF_VENDOR_ID 0x1964 #define USB_OSIF_PRODUCT_ID 0x0001 static const struct usb_device_id osif_table[] = { { USB_DEVICE(USB_OSIF_VENDOR_ID, USB_OSIF_PRODUCT_ID) }, { } }; MODULE_DEVICE_TABLE(usb, osif_table); static int osif_probe(struct usb_interface *interface, const struct usb_device_id *id) { int ret; struct osif_priv *priv; u16 version; priv = devm_kzalloc(&interface->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; priv->usb_dev = usb_get_dev(interface_to_usbdev(interface)); priv->interface = interface; usb_set_intfdata(interface, priv); priv->adapter.owner = THIS_MODULE; priv->adapter.class = I2C_CLASS_HWMON; priv->adapter.algo = &osif_algorithm; priv->adapter.algo_data = priv; snprintf(priv->adapter.name, sizeof(priv->adapter.name), "OSIF at bus %03d device %03d", priv->usb_dev->bus->busnum, priv->usb_dev->devnum); /* * Set bus frequency. The frequency is: * 120,000,000 / ( 16 + 2 * div * 4^prescale). * Using dev = 52, prescale = 0 give 100KHz */ ret = osif_usb_write(&priv->adapter, OSIFI2C_SET_BIT_RATE, 52, 0, NULL, 0); if (ret) { dev_err(&interface->dev, "failure sending bit rate"); usb_put_dev(priv->usb_dev); return ret; } i2c_add_adapter(&(priv->adapter)); version = le16_to_cpu(priv->usb_dev->descriptor.bcdDevice); dev_info(&interface->dev, "version %x.%02x found at bus %03d address %03d", version >> 8, version & 0xff, priv->usb_dev->bus->busnum, priv->usb_dev->devnum); return 0; } static void osif_disconnect(struct usb_interface *interface) { struct osif_priv *priv = usb_get_intfdata(interface); i2c_del_adapter(&(priv->adapter)); usb_set_intfdata(interface, NULL); usb_put_dev(priv->usb_dev); } static struct usb_driver osif_driver = { .name = "RobotFuzz Open Source InterFace, OSIF", .probe = osif_probe, .disconnect = osif_disconnect, .id_table = osif_table, }; module_usb_driver(osif_driver); MODULE_AUTHOR("Andrew Lunn <andrew@lunn.ch>"); MODULE_AUTHOR("Barry Carter <barry.carter@robotfuzz.com>"); MODULE_DESCRIPTION("RobotFuzz OSIF driver"); MODULE_LICENSE("GPL v2");
974 496 5 976 974 974 976 496 5 976 940 404 570 938 919 920 918 19 3 386 566 918 976 971 500 975 976 499 2 796 796 793 793 792 792 796 796 793 487 428 44 90 703 225 201 442 446 452 376 136 36 36 35 36 31 36 31 30 31 28 10 10 10 140 140 1 139 10 133 30 2 30 137 136 1 72 1 71 22 22 1144 2 1013 164 164 1146 649 649 649 651 1 952 955 710 468 953 952 287 274 15 647 647 417 370 651 650 651 16 16 5 916 14 448 1090 135 135 387 387 542 542 541 107 375 1 48 437 480 48 473 473 473 8 479 479 1 1 1 1 754 753 21 755 66 751 864 866 481 195 756 757 66 756 172 752 791 794 556 438 796 797 795 120 728 793 796 795 1093 1094 794 797 79 797 918 437 1094 795 797 78 1094 797 309 324 323 28 19 53 54 49 5 5 5 3 1 3 5 1 1 28 16 14 27 28 22 16 14 1371 1368 1372 541 540 540 4 540 539 7 41 42 44 44 5 4 1 2 5 2 22 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_generic.c Generic packet scheduler routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Jamal Hadi Salim, <hadi@cyberus.ca> 990601 * - Ingress support */ #include <linux/bitops.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/if_macvlan.h> #include <linux/bpf.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> #include <net/hotdata.h> #include <trace/events/qdisc.h> #include <trace/events/net.h> #include <net/xfrm.h> /* Qdisc to use by default */ const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; EXPORT_SYMBOL(default_qdisc_ops); static void qdisc_maybe_clear_missed(struct Qdisc *q, const struct netdev_queue *txq) { clear_bit(__QDISC_STATE_MISSED, &q->state); /* Make sure the below netif_xmit_frozen_or_stopped() * checking happens after clearing STATE_MISSED. */ smp_mb__after_atomic(); /* Checking netif_xmit_frozen_or_stopped() again to * make sure STATE_MISSED is set if the STATE_MISSED * set by netif_tx_wake_queue()'s rescheduling of * net_tx_action() is cleared by the above clear_bit(). */ if (!netif_xmit_frozen_or_stopped(txq)) set_bit(__QDISC_STATE_MISSED, &q->state); else set_bit(__QDISC_STATE_DRAINING, &q->state); } /* Main transmission queue. */ /* Modifications to data participating in scheduling must be protected with * qdisc_lock(qdisc) spinlock. * * The idea is the following: * - enqueue, dequeue are serialized via qdisc root lock * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex. */ #define SKB_XOFF_MAGIC ((struct sk_buff *)1UL) static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) { const struct netdev_queue *txq = q->dev_queue; spinlock_t *lock = NULL; struct sk_buff *skb; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } skb = skb_peek(&q->skb_bad_txq); if (skb) { /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { skb = __skb_dequeue(&q->skb_bad_txq); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } } else { skb = SKB_XOFF_MAGIC; qdisc_maybe_clear_missed(q, txq); } } if (lock) spin_unlock(lock); return skb; } static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q) { struct sk_buff *skb = skb_peek(&q->skb_bad_txq); if (unlikely(skb)) skb = __skb_dequeue_bad_txq(q); return skb; } static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, struct sk_buff *skb) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } __skb_queue_tail(&q->skb_bad_txq, skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_inc(q, skb); qdisc_qstats_cpu_qlen_inc(q); } else { qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; } if (lock) spin_unlock(lock); } static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } while (skb) { struct sk_buff *next = skb->next; __skb_queue_tail(&q->gso_skb, skb); /* it's still part of the queue */ if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_requeues_inc(q); qdisc_qstats_cpu_backlog_inc(q, skb); qdisc_qstats_cpu_qlen_inc(q); } else { q->qstats.requeues++; qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; } skb = next; } if (lock) { spin_unlock(lock); set_bit(__QDISC_STATE_MISSED, &q->state); } else { __netif_schedule(q); } } static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, int *packets) { int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; while (bytelimit > 0) { struct sk_buff *nskb = q->dequeue(q); if (!nskb) break; bytelimit -= nskb->len; /* covers GSO len */ skb->next = nskb; skb = nskb; (*packets)++; /* GSO counts as one pkt */ } skb_mark_not_on_list(skb); } /* This variant of try_bulk_dequeue_skb() makes sure * all skbs in the chain are for the same txq */ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, struct sk_buff *skb, int *packets) { int mapping = skb_get_queue_mapping(skb); struct sk_buff *nskb; int cnt = 0; do { nskb = q->dequeue(q); if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { qdisc_enqueue_skb_bad_txq(q, nskb); break; } skb->next = nskb; skb = nskb; } while (++cnt < 8); (*packets) += cnt; skb_mark_not_on_list(skb); } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, int *packets) { const struct netdev_queue *txq = q->dev_queue; struct sk_buff *skb = NULL; *packets = 1; if (unlikely(!skb_queue_empty(&q->gso_skb))) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } skb = skb_peek(&q->gso_skb); /* skb may be null if another cpu pulls gso_skb off in between * empty check and lock. */ if (!skb) { if (lock) spin_unlock(lock); goto validate; } /* skb in gso_skb were already validated */ *validate = false; if (xfrm_offload(skb)) *validate = true; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { skb = __skb_dequeue(&q->gso_skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } } else { skb = NULL; qdisc_maybe_clear_missed(q, txq); } if (lock) spin_unlock(lock); goto trace; } validate: *validate = true; if ((q->flags & TCQ_F_ONETXQUEUE) && netif_xmit_frozen_or_stopped(txq)) { qdisc_maybe_clear_missed(q, txq); return skb; } skb = qdisc_dequeue_skb_bad_txq(q); if (unlikely(skb)) { if (skb == SKB_XOFF_MAGIC) return NULL; goto bulk; } skb = q->dequeue(q); if (skb) { bulk: if (qdisc_may_bulk(q)) try_bulk_dequeue_skb(q, skb, txq, packets); else try_bulk_dequeue_skb_slow(q, skb, packets); } trace: trace_qdisc_dequeue(q, txq, *packets, skb); return skb; } /* * Transmit possibly several skbs, and handle the return status as * required. Owning qdisc running bit guarantees that only one CPU * can execute this function. * * Returns to the caller: * false - hardware queue frozen backoff * true - feel free to send more pkts */ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; bool again = false; /* And release qdisc */ if (root_lock) spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) skb = validate_xmit_skb_list(skb, dev, &again); #ifdef CONFIG_XFRM_OFFLOAD if (unlikely(again)) { if (root_lock) spin_lock(root_lock); dev_requeue_skb(skb, q); return false; } #endif if (likely(skb)) { HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); else qdisc_maybe_clear_missed(q, txq); HARD_TX_UNLOCK(dev, txq); } else { if (root_lock) spin_lock(root_lock); return true; } if (root_lock) spin_lock(root_lock); if (!dev_xmit_complete(ret)) { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) net_warn_ratelimited("BUG %s code %d qlen %d\n", dev->name, ret, q->q.qlen); dev_requeue_skb(skb, q); return false; } return true; } /* * NOTE: Called under qdisc_lock(q) with locally disabled BH. * * running seqcount guarantees only one CPU can process * this qdisc at a time. qdisc_lock(q) serializes queue accesses for * this queue. * * netif_tx_lock serializes accesses to device driver. * * qdisc_lock(q) and netif_tx_lock are mutually exclusive, * if one is grabbed, another must be free. * * Note, that this procedure can be called by a watchdog timer * * Returns to the caller: * 0 - queue is empty or throttled. * >0 - queue is not empty. * */ static inline bool qdisc_restart(struct Qdisc *q, int *packets) { spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; struct sk_buff *skb; bool validate; /* Dequeue packet */ skb = dequeue_skb(q, &validate, packets); if (unlikely(!skb)) return false; if (!(q->flags & TCQ_F_NOLOCK)) root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) { int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; while (qdisc_restart(q, &packets)) { quota -= packets; if (quota <= 0) { if (q->flags & TCQ_F_NOLOCK) set_bit(__QDISC_STATE_MISSED, &q->state); else __netif_schedule(q); break; } } } unsigned long dev_trans_start(struct net_device *dev) { unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start); unsigned long val; unsigned int i; for (i = 1; i < dev->num_tx_queues; i++) { val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start); if (val && time_after(val, res)) res = val; } return res; } EXPORT_SYMBOL(dev_trans_start); static void netif_freeze_queues(struct net_device *dev) { unsigned int i; int cpu; cpu = smp_processor_id(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); /* We are the only thread of execution doing a * freeze, but we have to grab the _xmit_lock in * order to synchronize with threads which are in * the ->hard_start_xmit() handler and already * checked the frozen bit. */ __netif_tx_lock(txq, cpu); set_bit(__QUEUE_STATE_FROZEN, &txq->state); __netif_tx_unlock(txq); } } void netif_tx_lock(struct net_device *dev) { spin_lock(&dev->tx_global_lock); netif_freeze_queues(dev); } EXPORT_SYMBOL(netif_tx_lock); static void netif_unfreeze_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); /* No need to grab the _xmit_lock here. If the * queue is not stopped for another reason, we * force a schedule. */ clear_bit(__QUEUE_STATE_FROZEN, &txq->state); netif_schedule_queue(txq); } } void netif_tx_unlock(struct net_device *dev) { netif_unfreeze_queues(dev); spin_unlock(&dev->tx_global_lock); } EXPORT_SYMBOL(netif_tx_unlock); static void dev_watchdog(struct timer_list *t) { struct net_device *dev = timer_container_of(dev, t, watchdog_timer); bool release = true; spin_lock(&dev->tx_global_lock); if (!qdisc_tx_is_noop(dev)) { if (netif_device_present(dev) && netif_running(dev) && netif_carrier_ok(dev)) { unsigned int timedout_ms = 0; unsigned int i; unsigned long trans_start; unsigned long oldest_start = jiffies; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); if (!netif_xmit_stopped(txq)) continue; /* Paired with WRITE_ONCE() + smp_mb...() in * netdev_tx_sent_queue() and netif_tx_stop_queue(). */ smp_mb(); trans_start = READ_ONCE(txq->trans_start); if (time_after(jiffies, trans_start + dev->watchdog_timeo)) { timedout_ms = jiffies_to_msecs(jiffies - trans_start); atomic_long_inc(&txq->trans_timeout); break; } if (time_after(oldest_start, trans_start)) oldest_start = trans_start; } if (unlikely(timedout_ms)) { trace_net_dev_xmit_timeout(dev, i); netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n", raw_smp_processor_id(), i, timedout_ms); netif_freeze_queues(dev); dev->netdev_ops->ndo_tx_timeout(dev, i); netif_unfreeze_queues(dev); } if (!mod_timer(&dev->watchdog_timer, round_jiffies(oldest_start + dev->watchdog_timeo))) release = false; } } spin_unlock(&dev->tx_global_lock); if (release) netdev_put(dev, &dev->watchdog_dev_tracker); } void netdev_watchdog_up(struct net_device *dev) { if (!dev->netdev_ops->ndo_tx_timeout) return; if (dev->watchdog_timeo <= 0) dev->watchdog_timeo = 5*HZ; if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo))) netdev_hold(dev, &dev->watchdog_dev_tracker, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(netdev_watchdog_up); static void netdev_watchdog_down(struct net_device *dev) { netif_tx_lock_bh(dev); if (timer_delete(&dev->watchdog_timer)) netdev_put(dev, &dev->watchdog_dev_tracker); netif_tx_unlock_bh(dev); } /** * netif_carrier_on - set carrier * @dev: network device * * Device has detected acquisition of carrier. */ void netif_carrier_on(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); if (netif_running(dev)) netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_carrier_on); /** * netif_carrier_off - clear carrier * @dev: network device * * Device has detected loss of carrier. */ void netif_carrier_off(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } } EXPORT_SYMBOL(netif_carrier_off); /** * netif_carrier_event - report carrier state event * @dev: network device * * Device has detected a carrier event but the carrier state wasn't changed. * Use in drivers when querying carrier state asynchronously, to avoid missing * events (link flaps) if link recovers before it's queried. */ void netif_carrier_event(struct net_device *dev) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_up_count); atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } EXPORT_SYMBOL_GPL(netif_carrier_event); /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. */ static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { dev_core_stats_tx_dropped_inc(skb->dev); __qdisc_drop(skb, to_free); return NET_XMIT_CN; } static struct sk_buff *noop_dequeue(struct Qdisc *qdisc) { return NULL; } struct Qdisc_ops noop_qdisc_ops __read_mostly = { .id = "noop", .priv_size = 0, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; static struct netdev_queue noop_netdev_queue = { RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc), RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc), }; struct Qdisc noop_qdisc = { .enqueue = noop_enqueue, .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, .prev = (struct sk_buff *)&noop_qdisc.gso_skb, .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock), }, .skb_bad_txq = { .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq, .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq, .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, .owner = -1, }; EXPORT_SYMBOL(noop_qdisc); static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt, struct netlink_ext_ack *extack) { /* register_qdisc() assigns a default of noop_enqueue if unset, * but __dev_queue_xmit() treats noqueue only as such * if this is NULL - so clear it here. */ qdisc->enqueue = NULL; return 0; } struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, .init = noqueue_init, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; EXPORT_SYMBOL(sch_default_prio2band); /* 3-band FIFO queue: old style, but should be a bit faster than generic prio+fifo combination. */ #define PFIFO_FAST_BANDS 3 /* * Private data for a pfifo_fast scheduler containing: * - rings for priority bands */ struct pfifo_fast_priv { struct skb_array q[PFIFO_FAST_BANDS]; }; static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, int band) { return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct skb_array *q = band2list(priv, band); unsigned int pkt_len = qdisc_pkt_len(skb); int err; err = skb_array_produce(q, skb); if (unlikely(err)) { if (qdisc_is_percpu_stats(qdisc)) return qdisc_drop_cpu(skb, qdisc, to_free); else return qdisc_drop(skb, qdisc, to_free); } qdisc_update_stats_at_enqueue(qdisc, pkt_len); return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; bool need_retry = true; int band; retry: for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band); if (__skb_array_empty(q)) continue; skb = __skb_array_consume(q); } if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else if (need_retry && READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) { /* Delay clearing the STATE_MISSED here to reduce * the overhead of the second spin_trylock() in * qdisc_run_begin() and __netif_schedule() calling * in qdisc_run_end(). */ clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); /* Make sure dequeuing happens after clearing * STATE_MISSED. */ smp_mb__after_atomic(); need_retry = false; goto retry; } return skb; } static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; int band; for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band); skb = __skb_array_peek(q); } return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (band = 0; band < PFIFO_FAST_BANDS; band++) { struct skb_array *q = band2list(priv, band); struct sk_buff *skb; /* NULL ring is possible if destroy path is due to a failed * skb_array_init() in pfifo_fast_init() case. */ if (!q->ring.queue) continue; while ((skb = __skb_array_consume(q)) != NULL) kfree_skb(skb); } if (qdisc_is_percpu_stats(qdisc)) { for_each_possible_cpu(i) { struct gnet_stats_queue *q; q = per_cpu_ptr(qdisc->cpu_qstats, i); q->backlog = 0; q->qlen = 0; } } } static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) { struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: return -1; } static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt, struct netlink_ext_ack *extack) { unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); int prio; /* guard against zero length rings */ if (!qlen) return -EINVAL; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); int err; err = skb_array_init(q, qlen, GFP_KERNEL); if (err) return -ENOMEM; } /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } static void pfifo_fast_destroy(struct Qdisc *sch) { struct pfifo_fast_priv *priv = qdisc_priv(sch); int prio; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); /* NULL ring is possible if destroy path is due to a failed * skb_array_init() in pfifo_fast_init() case. */ if (!q->ring.queue) continue; /* Destroy ring but no need to kfree_skb because a call to * pfifo_fast_reset() has already done that work. */ ptr_ring_cleanup(&q->ring, NULL); } } static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, unsigned int new_len) { struct pfifo_fast_priv *priv = qdisc_priv(sch); struct skb_array *bands[PFIFO_FAST_BANDS]; int prio; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); bands[prio] = q; } return skb_array_resize_multiple_bh(bands, PFIFO_FAST_BANDS, new_len, GFP_KERNEL); } struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = sizeof(struct pfifo_fast_priv), .enqueue = pfifo_fast_enqueue, .dequeue = pfifo_fast_dequeue, .peek = pfifo_fast_peek, .init = pfifo_fast_init, .destroy = pfifo_fast_destroy, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .change_tx_queue_len = pfifo_fast_change_tx_queue_len, .owner = THIS_MODULE, .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, }; EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) { struct Qdisc *sch; unsigned int size = sizeof(*sch) + ops->priv_size; int err = -ENOBUFS; struct net_device *dev; if (!dev_queue) { NL_SET_ERR_MSG(extack, "No device queue given"); err = -EINVAL; goto errout; } dev = dev_queue->dev; sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue)); if (!sch) goto errout; __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); gnet_stats_basic_sync_init(&sch->bstats); lockdep_register_key(&sch->root_lock_key); spin_lock_init(&sch->q.lock); lockdep_set_class(&sch->q.lock, &sch->root_lock_key); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!sch->cpu_bstats) goto errout1; sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!sch->cpu_qstats) { free_percpu(sch->cpu_bstats); goto errout1; } } spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); lockdep_set_class(&sch->seqlock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); sch->ops = ops; sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); return sch; errout1: lockdep_unregister_key(&sch->root_lock_key); kfree(sch); errout: return ERR_PTR(err); } struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, unsigned int parentid, struct netlink_ext_ack *extack) { struct Qdisc *sch; if (!bpf_try_module_get(ops, ops->owner)) { NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { bpf_module_put(ops, ops->owner); return NULL; } sch->parent = parentid; if (!ops->init || ops->init(sch, NULL, extack) == 0) { trace_qdisc_create(ops, dev_queue->dev, parentid); return sch; } qdisc_put(sch); return NULL; } EXPORT_SYMBOL(qdisc_create_dflt); /* Under qdisc_lock(qdisc) and BH! */ void qdisc_reset(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; trace_qdisc_reset(qdisc); if (ops->reset) ops->reset(qdisc); __skb_queue_purge(&qdisc->gso_skb); __skb_queue_purge(&qdisc->skb_bad_txq); qdisc->q.qlen = 0; qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); void qdisc_free(struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); free_percpu(qdisc->cpu_qstats); } kfree(qdisc); } static void qdisc_free_cb(struct rcu_head *head) { struct Qdisc *q = container_of(head, struct Qdisc, rcu); qdisc_free(q); } static void __qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; struct net_device *dev = qdisc_dev(qdisc); #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->rate_est); qdisc_reset(qdisc); if (ops->destroy) ops->destroy(qdisc); lockdep_unregister_key(&qdisc->root_lock_key); bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); call_rcu(&qdisc->rcu, qdisc_free_cb); } void qdisc_destroy(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; __qdisc_destroy(qdisc); } void qdisc_put(struct Qdisc *qdisc) { if (!qdisc) return; if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt)) return; __qdisc_destroy(qdisc); } EXPORT_SYMBOL(qdisc_put); /* Version of qdisc_put() that is called with rtnl mutex unlocked. * Intended to be used as optimization, this function only takes rtnl lock if * qdisc reference counter reached zero. */ void qdisc_put_unlocked(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_rtnl_lock(&qdisc->refcnt)) return; __qdisc_destroy(qdisc); rtnl_unlock(); } EXPORT_SYMBOL(qdisc_put_unlocked); /* Attach toplevel qdisc to device queue. */ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc) { struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping); spinlock_t *root_lock; root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); /* ... and graft new one */ if (qdisc == NULL) qdisc = &noop_qdisc; rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); spin_unlock_bh(root_lock); return oqdisc; } EXPORT_SYMBOL(dev_graft_qdisc); static void shutdown_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc_default) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); struct Qdisc *qdisc_default = _qdisc_default; if (qdisc) { rcu_assign_pointer(dev_queue->qdisc, qdisc_default); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default); qdisc_put(qdisc); } } static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { struct Qdisc *qdisc; const struct Qdisc_ops *ops = default_qdisc_ops; if (dev->priv_flags & IFF_NO_QUEUE) ops = &noqueue_qdisc_ops; else if(dev->type == ARPHRD_CAN) ops = &pfifo_fast_ops; qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); if (!qdisc) return; if (!netif_is_multiqueue(dev)) qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } static void attach_default_qdiscs(struct net_device *dev) { struct netdev_queue *txq; struct Qdisc *qdisc; txq = netdev_get_tx_queue(dev, 0); if (!netif_is_multiqueue(dev) || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { rcu_assign_pointer(dev->qdisc, qdisc); qdisc->ops->attach(qdisc); } } qdisc = rtnl_dereference(dev->qdisc); /* Detect default qdisc setup/init failed and fallback to "noqueue" */ if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); dev->priv_flags ^= IFF_NO_QUEUE; } #ifdef CONFIG_NET_SCHED if (qdisc != &noop_qdisc) qdisc_hash_add(qdisc, false); #endif } static void transition_one_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_need_watchdog) { struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); int *need_watchdog_p = _need_watchdog; if (!(new_qdisc->flags & TCQ_F_BUILTIN)) clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); if (need_watchdog_p) { WRITE_ONCE(dev_queue->trans_start, 0); *need_watchdog_p = 1; } } void dev_activate(struct net_device *dev) { int need_watchdog; /* No queueing discipline is attached to device; * create default one for devices, which need queueing * and noqueue_qdisc for virtual interfaces */ if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) /* Delay activation until next carrier-on event */ return; need_watchdog = 0; netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); if (dev_ingress_queue(dev)) transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); if (need_watchdog) { netif_trans_update(dev); netdev_watchdog_up(dev); } } EXPORT_SYMBOL(dev_activate); static void qdisc_deactivate(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); } static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_sync_needed) { bool *sync_needed = _sync_needed; struct Qdisc *qdisc; qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { if (qdisc->enqueue) *sync_needed = true; qdisc_deactivate(qdisc); rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); } } static void dev_reset_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { struct Qdisc *qdisc; bool nolock; qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); if (!qdisc) return; nolock = qdisc->flags & TCQ_F_NOLOCK; if (nolock) spin_lock_bh(&qdisc->seqlock); spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); if (nolock) { clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); spin_unlock_bh(&qdisc->seqlock); } } static bool some_qdisc_is_busy(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; spinlock_t *root_lock; struct Qdisc *q; int val; dev_queue = netdev_get_tx_queue(dev, i); q = rtnl_dereference(dev_queue->qdisc_sleeping); root_lock = qdisc_lock(q); spin_lock_bh(root_lock); val = (qdisc_is_running(q) || test_bit(__QDISC_STATE_SCHED, &q->state)); spin_unlock_bh(root_lock); if (val) return true; } return false; } /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate * * This function returns only when all outstanding transmissions * have completed, unless all devices are in dismantle phase. */ void dev_deactivate_many(struct list_head *head) { bool sync_needed = false; struct net_device *dev; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &sync_needed); if (dev_ingress_queue(dev)) dev_deactivate_queue(dev, dev_ingress_queue(dev), &sync_needed); netdev_watchdog_down(dev); } /* Wait for outstanding qdisc enqueuing calls. */ if (sync_needed) synchronize_net(); list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); if (dev_ingress_queue(dev)) dev_reset_queue(dev, dev_ingress_queue(dev), NULL); } /* Wait for outstanding qdisc_run calls. */ list_for_each_entry(dev, head, close_list) { while (some_qdisc_is_busy(dev)) { /* wait_event() would avoid this sleep-loop but would * require expensive checks in the fast paths of packet * processing which isn't worth it. */ schedule_timeout_uninterruptible(1); } } } void dev_deactivate(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_deactivate_many(&single); list_del(&single); } EXPORT_SYMBOL(dev_deactivate); static int qdisc_change_tx_queue_len(struct net_device *dev, struct netdev_queue *dev_queue) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); const struct Qdisc_ops *ops = qdisc->ops; if (ops->change_tx_queue_len) return ops->change_tx_queue_len(qdisc, dev->tx_queue_len); return 0; } void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); } void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx) { #ifdef CONFIG_NET_SCHED struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int i; for (i = new_real_tx; i < dev->real_num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); /* Only update the default qdiscs we created, * qdiscs with handles are always hashed. */ if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_del(qdisc); } for (i = dev->real_num_tx_queues; i < new_real_tx; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_add(qdisc, false); } #endif } EXPORT_SYMBOL(mq_change_real_num_tx); int dev_qdisc_change_tx_queue_len(struct net_device *dev) { bool up = dev->flags & IFF_UP; unsigned int i; int ret = 0; if (up) dev_deactivate(dev); for (i = 0; i < dev->num_tx_queues; i++) { ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); /* TODO: revert changes on a partial failure */ if (ret) break; } if (up) dev_activate(dev); return ret; } static void dev_init_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc) { struct Qdisc *qdisc = _qdisc; rcu_assign_pointer(dev_queue->qdisc, qdisc); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } void dev_init_scheduler(struct net_device *dev) { rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } void dev_shutdown(struct net_device *dev) { netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); qdisc_put(rtnl_dereference(dev->qdisc)); rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } /** * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division * @rate: Rate to compute reciprocal division values of * @mult: Multiplier for reciprocal division * @shift: Shift for reciprocal division * * The multiplier and shift for reciprocal division by rate are stored * in mult and shift. * * The deal here is to replace a divide by a reciprocal one * in fast path (a reciprocal divide is a multiply and a shift) * * Normal formula would be : * time_in_ns = (NSEC_PER_SEC * len) / rate_bps * * We compute mult/shift to use instead : * time_in_ns = (len * mult) >> shift; * * We try to get the highest possible mult value for accuracy, * but have to make sure no overflows will ever happen. * * reciprocal_value() is not used here it doesn't handle 64-bit values. */ static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift) { u64 factor = NSEC_PER_SEC; *mult = 1; *shift = 0; if (rate <= 0) return; for (;;) { *mult = div64_u64(factor, rate); if (*mult & (1U << 31) || factor & (1ULL << 63)) break; factor <<= 1; (*shift)++; } } void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64) { memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; r->mpu = conf->mpu; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ratecfg_precompute); void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64) { r->rate_pkts_ps = pktrate64; psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ppscfg_precompute); void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head) { /* Protected with chain0->filter_chain_lock. * Can't access chain directly because tp_head can be NULL. */ struct mini_Qdisc *miniq_old = rcu_dereference_protected(*miniqp->p_miniq, 1); struct mini_Qdisc *miniq; if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); } else { miniq = miniq_old != &miniqp->miniq1 ? &miniqp->miniq1 : &miniqp->miniq2; /* We need to make sure that readers won't see the miniq * we are about to modify. So ensure that at least one RCU * grace period has elapsed since the miniq was made * inactive. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) cond_synchronize_rcu(miniq->rcu_state); else if (!poll_state_synchronize_rcu(miniq->rcu_state)) synchronize_rcu_expedited(); miniq->filter_list = tp_head; rcu_assign_pointer(*miniqp->p_miniq, miniq); } if (miniq_old) /* This is counterpart of the rcu sync above. We need to * block potential new user of miniq_old until all readers * are not seeing it. */ miniq_old->rcu_state = start_poll_synchronize_rcu(); } EXPORT_SYMBOL(mini_qdisc_pair_swap); void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block) { miniqp->miniq1.block = block; miniqp->miniq2.block = block; } EXPORT_SYMBOL(mini_qdisc_pair_block_init); void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq) { miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq1.rcu_state = get_state_synchronize_rcu(); miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state; miniqp->p_miniq = p_miniq; } EXPORT_SYMBOL(mini_qdisc_pair_init);
2 1 141 539 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. */ #ifndef IB_ADDR_H #define IB_ADDR_H #include <linux/ethtool.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/socket.h> #include <linux/if_vlan.h> #include <net/ipv6.h> #include <net/if_inet6.h> #include <net/ip.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> #include <net/net_namespace.h> /** * struct rdma_dev_addr - Contains resolved RDMA hardware addresses * @src_dev_addr: Source MAC address. * @dst_dev_addr: Destination MAC address. * @broadcast: Broadcast address of the device. * @dev_type: The interface hardware type of the device. * @bound_dev_if: An optional device interface index. * @transport: The transport type used. * @net: Network namespace containing the bound_dev_if net_dev. * @sgid_attr: GID attribute to use for identified SGID */ struct rdma_dev_addr { unsigned char src_dev_addr[MAX_ADDR_LEN]; unsigned char dst_dev_addr[MAX_ADDR_LEN]; unsigned char broadcast[MAX_ADDR_LEN]; unsigned short dev_type; int bound_dev_if; enum rdma_transport_type transport; struct net *net; const struct ib_gid_attr *sgid_attr; enum rdma_network_type network; int hoplimit; }; /** * rdma_translate_ip - Translate a local IP address to an RDMA hardware * address. * * The dev_addr->net field must be initialized. */ int rdma_translate_ip(const struct sockaddr *addr, struct rdma_dev_addr *dev_addr); /** * rdma_resolve_ip - Resolve source and destination IP addresses to * RDMA hardware addresses. * @src_addr: An optional source address to use in the resolution. If a * source address is not provided, a usable address will be returned via * the callback. * @dst_addr: The destination address to resolve. * @addr: A reference to a data location that will receive the resolved * addresses. The data location must remain valid until the callback has * been invoked. The net field of the addr struct must be valid. * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. * @resolve_by_gid_attr: Resolve the ip based on the GID attribute from * rdma_dev_addr. * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); int rdma_addr_size(const struct sockaddr *addr); int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) { return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; } static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) { dev_addr->broadcast[8] = pkey >> 8; dev_addr->broadcast[9] = (unsigned char) pkey; } static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); } static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) { return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; } static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) { return is_vlan_dev(dev) ? vlan_dev_vlan_id(dev) : 0xffff; } static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) { switch (addr->sa_family) { case AF_INET: ipv6_addr_set_v4mapped(((struct sockaddr_in *) addr)->sin_addr.s_addr, (struct in6_addr *)gid); break; case AF_INET6: *(struct in6_addr *)&gid->raw = ((struct sockaddr_in6 *)addr)->sin6_addr; break; default: return -EINVAL; } return 0; } /* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid) { if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { struct sockaddr_in *out_in = (struct sockaddr_in *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin_family = AF_INET; memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4); } else { struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin6_family = AF_INET6; memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); } } /* * rdma_get/set_sgid/dgid() APIs are applicable to IB, and iWarp. * They are not applicable to RoCE. * RoCE GIDs are derived from the IP addresses. */ static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(*gid)); } static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline enum ib_mtu iboe_get_mtu(int mtu) { /* * Reduce IB headers from effective IBoE MTU. */ mtu = mtu - (IB_GRH_BYTES + IB_UDP_BYTES + IB_BTH_BYTES + IB_EXT_XRC_BYTES + IB_EXT_ATOMICETH_BYTES + IB_ICRC_BYTES); if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096)) return IB_MTU_4096; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048)) return IB_MTU_2048; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024)) return IB_MTU_1024; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512)) return IB_MTU_512; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256)) return IB_MTU_256; else return 0; } static inline int rdma_link_local_addr(struct in6_addr *addr) { if (addr->s6_addr32[0] == htonl(0xfe800000) && addr->s6_addr32[1] == 0) return 1; return 0; } static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) { memcpy(mac, &addr->s6_addr[8], 3); memcpy(mac + 3, &addr->s6_addr[13], 3); mac[0] ^= 2; } static inline int rdma_is_multicast_addr(struct in6_addr *addr) { __be32 ipv4_addr; if (addr->s6_addr[0] == 0xff) return 1; ipv4_addr = addr->s6_addr32[3]; return (ipv6_addr_v4mapped(addr) && ipv4_is_multicast(ipv4_addr)); } static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) { int i; mac[0] = 0x33; mac[1] = 0x33; for (i = 2; i < 6; ++i) mac[i] = addr->s6_addr[i + 10]; } static inline u16 rdma_get_vlan_id(union ib_gid *dgid) { u16 vid; vid = dgid->raw[11] << 8 | dgid->raw[12]; return vid < 0x1000 ? vid : 0xffff; } static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) { return is_vlan_dev(dev) ? vlan_dev_real_dev(dev) : NULL; } #endif /* IB_ADDR_H */
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0-only */ /* * This file is part of the Linux kernel. * * Copyright (c) 2011-2014, Intel Corporation * Authors: Fenghua Yu <fenghua.yu@intel.com>, * H. Peter Anvin <hpa@linux.intel.com> */ #ifndef ASM_X86_ARCHRANDOM_H #define ASM_X86_ARCHRANDOM_H #include <asm/processor.h> #include <asm/cpufeature.h> #define RDRAND_RETRY_LOOPS 10 /* Unconditional execution of RDRAND and RDSEED */ static inline bool __must_check rdrand_long(unsigned long *v) { bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { asm volatile("rdrand %[out]" CC_SET(c) : CC_OUT(c) (ok), [out] "=r" (*v)); if (ok) return true; } while (--retry); return false; } static inline bool __must_check rdseed_long(unsigned long *v) { bool ok; asm volatile("rdseed %[out]" CC_SET(c) : CC_OUT(c) (ok), [out] "=r" (*v)); return ok; } /* * These are the generic interfaces; they must not be declared if the * stubs in <linux/random.h> are to be invoked. */ static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { return max_longs && static_cpu_has(X86_FEATURE_RDRAND) && rdrand_long(v) ? 1 : 0; } static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { return max_longs && static_cpu_has(X86_FEATURE_RDSEED) && rdseed_long(v) ? 1 : 0; } #ifndef CONFIG_UML void x86_init_rdrand(struct cpuinfo_x86 *c); #endif #endif /* ASM_X86_ARCHRANDOM_H */
68 68 318 318 1 323 236 236 88 66 66 63 3 66 63 83 28 28 28 19 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsverity.h> #include <keys/system_keyring.h> #include <uapi/linux/fsverity.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM static char *ima_appraise_cmdline_default __initdata; core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = ima_appraise; if (!str) return; if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) appraisal_state = IMA_APPRAISE_LOG; else if (strncmp(str, "fix", 3) == 0) appraisal_state = IMA_APPRAISE_FIX; else if (strncmp(str, "enforce", 7) == 0) appraisal_state = IMA_APPRAISE_ENFORCE; else pr_err("invalid \"%s\" appraise option", str); /* If appraisal state was changed, but secure boot is enabled, * keep its default */ if (sb_state) { if (!(appraisal_state & IMA_APPRAISE_ENFORCE)) pr_info("Secure boot enabled: ignoring ima_appraise=%s option", str); } else { ima_appraise = appraisal_state; } } #endif /* * is_ima_appraise_enabled - return appraise status * * Only return enabled, if not in ima_appraise="fix" or "log" modes. */ bool is_ima_appraise_enabled(void) { return ima_appraise & IMA_APPRAISE_ENFORCE; } /* * ima_must_appraise - set appraise flag * * Return 1 to appraise or hash */ int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { struct lsm_prop prop; if (!ima_appraise) return 0; security_current_getlsmprop_subj(&prop); return ima_match_policy(idmap, inode, current_cred(), &prop, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint) { int rc, offset; u8 algo = iint->ima_hash->algo; if (algo <= HASH_ALGO_SHA1) { offset = 1; iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; } else { offset = 0; iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); return rc; } /* Return specific func appraised cached result */ enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return iint->ima_mmap_status; case BPRM_CHECK: return iint->ima_bprm_status; case CREDS_CHECK: return iint->ima_creds_status; case FILE_CHECK: case POST_SETATTR: return iint->ima_file_status; case MODULE_CHECK ... MAX_CHECK - 1: default: return iint->ima_read_status; } } static void ima_set_cache_status(struct ima_iint_cache *iint, enum ima_hooks func, enum integrity_status status) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->ima_mmap_status = status; break; case BPRM_CHECK: iint->ima_bprm_status = status; break; case CREDS_CHECK: iint->ima_creds_status = status; break; case FILE_CHECK: case POST_SETATTR: iint->ima_file_status = status; break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->ima_read_status = status; break; } } static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; case BPRM_CHECK: iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED); break; case CREDS_CHECK: iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED); break; case FILE_CHECK: case POST_SETATTR: iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED); break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED); break; } } enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len) { struct signature_v2_hdr *sig; enum hash_algo ret; if (!xattr_value || xattr_len < 2) /* return default hash algo */ return ima_hash_algo; switch (xattr_value->type) { case IMA_VERITY_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 3 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case EVM_IMA_XATTR_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 2 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ ret = xattr_value->data[0]; if (ret < HASH_ALGO__LAST) return ret; break; case IMA_XATTR_DIGEST: /* this is for backward compatibility */ if (xattr_len == 21) { unsigned int zero = 0; if (!memcmp(&xattr_value->data[16], &zero, 4)) return HASH_ALGO_MD5; else return HASH_ALGO_SHA1; } else if (xattr_len == 17) return HASH_ALGO_MD5; break; } /* return default hash algo */ return ima_hash_algo; } int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { int ret; ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; return ret; } /* * calc_file_id_hash - calculate the hash of the ima_file_id struct data * @type: xattr type [enum evm_ima_xattr_type] * @algo: hash algorithm [enum hash_algo] * @digest: pointer to the digest to be hashed * @hash: (out) pointer to the hash * * IMA signature version 3 disambiguates the data that is signed by * indirectly signing the hash of the ima_file_id structure data. * * Signing the ima_file_id struct is currently only supported for * IMA_VERITY_DIGSIG type xattrs. * * Return 0 on success, error code otherwise. */ static int calc_file_id_hash(enum evm_ima_xattr_type type, enum hash_algo algo, const u8 *digest, struct ima_digest_data *hash) { struct ima_file_id file_id = { .hash_type = IMA_VERITY_DIGSIG, .hash_algorithm = algo}; unsigned int unused = HASH_MAX_DIGESTSIZE - hash_digest_size[algo]; if (type != IMA_VERITY_DIGSIG) return -EINVAL; memcpy(file_id.hash, digest, hash_digest_size[algo]); hash->algo = algo; hash->length = hash_digest_size[algo]; return ima_calc_buffer_hash(&file_id, sizeof(file_id) - unused, hash); } /* * xattr_verify - verify xattr digest or signature * * Verify whether the hash or signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint, struct evm_ima_xattr_data *xattr_value, int xattr_len, enum integrity_status *status, const char **cause) { struct ima_max_digest_data hash; struct signature_v2_hdr *sig; int rc = -EINVAL, hash_start = 0; int mask; switch (xattr_value->type) { case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ hash_start = 1; fallthrough; case IMA_XATTR_DIGEST: if (*status != INTEGRITY_PASS_IMMUTABLE) { if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) *cause = "verity-signature-required"; else *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } clear_bit(IMA_DIGSIG, &iint->atomic_flags); } else { set_bit(IMA_DIGSIG, &iint->atomic_flags); } if (xattr_len - sizeof(xattr_value->type) - hash_start >= iint->ima_hash->length) /* * xattr length may be longer. md5 hash in previous * version occupied 20 bytes in xattr, instead of 16 */ rc = memcmp(&xattr_value->data[hash_start], iint->ima_hash->digest, iint->ima_hash->length); else rc = -EINVAL; if (rc) { *cause = "invalid-hash"; *status = INTEGRITY_FAIL; break; } *status = INTEGRITY_PASS; break; case EVM_IMA_XATTR_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED; if ((iint->flags & mask) == mask) { *cause = "verity-signature-required"; *status = INTEGRITY_FAIL; break; } sig = (typeof(sig))xattr_value; if (sig->version >= 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; case IMA_VERITY_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); if (iint->flags & IMA_DIGSIG_REQUIRED) { if (!(iint->flags & IMA_VERITY_REQUIRED)) { *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } } sig = (typeof(sig))xattr_value; if (sig->version != 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo, iint->ima_hash->digest, container_of(&hash.hdr, struct ima_digest_data, hdr)); if (rc) { *cause = "sigv3-hashing-error"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, hash.digest, hash.hdr.length); if (rc) { *cause = "invalid-verity-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; default: *status = INTEGRITY_UNKNOWN; *cause = "unknown-ima-data"; break; } return rc; } /* * modsig_verify - verify modsig signature * * Verify whether the signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, enum integrity_status *status, const char **cause) { int rc; rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, modsig); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } return rc; } /* * ima_check_blacklist - determine if the binary is blacklisted. * * Add the hash of the blacklisted binary to the measurement list, based * on policy. * * Returns -EPERM if the hash is blacklisted. */ int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { enum hash_algo hash_algo; const u8 *digest = NULL; u32 digestsize = 0; int rc = 0; if (!(iint->flags & IMA_CHECK_BLACKLIST)) return 0; if (iint->flags & IMA_MODSIG_ALLOWED && modsig) { ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); rc = is_binary_blacklisted(digest, digestsize); } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) rc = is_binary_blacklisted(iint->ima_hash->digest, iint->ima_hash->length); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); return rc; } static bool is_bprm_creds_for_exec(enum ima_hooks func, struct file *file) { struct linux_binprm *bprm; if (func == BPRM_CHECK) { bprm = container_of(&file, struct linux_binprm, file); return bprm->is_check; } return false; } /* * ima_appraise_measurement - appraise file measurement * * Call evm_verifyxattr() to verify the integrity of 'security.ima'. * Assuming success, compare the xattr hash with the collected measurement. * * Return 0 on success, error code otherwise */ int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig) { static const char op[] = "appraise_data"; int audit_msgno = AUDIT_INTEGRITY_DATA; const char *cause = "unknown"; struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; /* If not appraising a modsig, we need an xattr. */ if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; /* * Unlike any of the other LSM hooks where the kernel enforces file * integrity, enforcing file integrity for the bprm_creds_for_exec() * LSM hook with the AT_EXECVE_CHECK flag is left up to the discretion * of the script interpreter(userspace). Differentiate kernel and * userspace enforced integrity audit messages. */ if (is_bprm_creds_for_exec(func, file)) audit_msgno = AUDIT_INTEGRITY_USERSPACE; /* If reading the xattr failed and there's no modsig, error out. */ if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) cause = "verity-signature-required"; else cause = "IMA-signature-required"; } else { cause = "missing-hash"; } status = INTEGRITY_NOLABEL; if (file->f_mode & FMODE_CREATED) iint->flags |= IMA_NEW_FILE; if ((iint->flags & IMA_NEW_FILE) && (!(iint->flags & IMA_DIGSIG_REQUIRED) || (inode->i_size == 0))) status = INTEGRITY_PASS; goto out; } status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value, rc < 0 ? 0 : rc); switch (status) { case INTEGRITY_PASS: case INTEGRITY_PASS_IMMUTABLE: case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ /* It's fine not to have xattrs when using a modsig. */ if (try_modsig) break; fallthrough; case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; case INTEGRITY_FAIL_IMMUTABLE: set_bit(IMA_DIGSIG, &iint->atomic_flags); cause = "invalid-fail-immutable"; goto out; case INTEGRITY_FAIL: /* Invalid HMAC/signature. */ cause = "invalid-HMAC"; goto out; default: WARN_ONCE(true, "Unexpected integrity status %d\n", status); } if (xattr_value) rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); /* * If we have a modsig and either no imasig or the imasig's key isn't * known, then try verifying the modsig. */ if (try_modsig && (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || rc == -ENOKEY)) rc = modsig_verify(func, modsig, &status, &cause); out: /* * File signatures on some filesystems can not be properly verified. * When such filesystems are mounted by an untrusted mounter or on a * system not willing to accept such a risk, fail the file signature * verification. */ if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) || (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) { status = INTEGRITY_FAIL; cause = "unverifiable-signature"; integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; } /* * Permit new files with file/EVM portable signatures, but * without data. */ if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE && test_bit(IMA_DIGSIG, &iint->atomic_flags)) { status = INTEGRITY_PASS; } integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else { ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); return status; } /* * ima_update_xattr - update 'security.ima' hash value */ void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ if (test_bit(IMA_DIGSIG, &iint->atomic_flags)) return; if ((iint->ima_file_status != INTEGRITY_PASS) && !(iint->flags & IMA_HASH)) return; rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; inode_lock(file_inode(file)); ima_fix_xattr(dentry, iint); inode_unlock(file_inode(file)); } /** * ima_inode_post_setattr - reflect file metadata changes * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * Changes to a dentry's metadata might result in needing to appraise. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { struct inode *inode = d_backing_inode(dentry); struct ima_iint_cache *iint; int action; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) return; action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = ima_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); if (!action) clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } } /* * ima_protect_xattr - protect 'security.ima' * * Ensure that not just anyone can modify or remove 'security.ima'. */ static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { if (strcmp(xattr_name, XATTR_NAME_IMA) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 1; } return 0; } static void ima_reset_appraise_flags(struct inode *inode, int digsig) { struct ima_iint_cache *iint; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find(inode); if (!iint) return; iint->measured_pcrs = 0; set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags); if (digsig) set_bit(IMA_DIGSIG, &iint->atomic_flags); else clear_bit(IMA_DIGSIG, &iint->atomic_flags); } /** * validate_hash_algo() - Block setxattr with unsupported hash algorithms * @dentry: object of the setxattr() * @xattr_value: userland supplied xattr value * @xattr_value_len: length of xattr_value * * The xattr value is mapped to its hash algorithm, and this algorithm * must be built in the kernel for the setxattr to be allowed. * * Emit an audit message when the algorithm is invalid. * * Return: 0 on success, else an error. */ static int validate_hash_algo(struct dentry *dentry, const struct evm_ima_xattr_data *xattr_value, size_t xattr_value_len) { char *path = NULL, *pathbuf = NULL; enum hash_algo xattr_hash_algo; const char *errmsg = "unavailable-hash-algorithm"; unsigned int allowed_hashes; xattr_hash_algo = ima_get_hash_algo(xattr_value, xattr_value_len); allowed_hashes = atomic_read(&ima_setxattr_allowed_hash_algorithms); if (allowed_hashes) { /* success if the algorithm is allowed in the ima policy */ if (allowed_hashes & (1U << xattr_hash_algo)) return 0; /* * We use a different audit message when the hash algorithm * is denied by a policy rule, instead of not being built * in the kernel image */ errmsg = "denied-hash-algorithm"; } else { if (likely(xattr_hash_algo == ima_hash_algo)) return 0; /* allow any xattr using an algorithm built in the kernel */ if (crypto_has_alg(hash_algo_name[xattr_hash_algo], 0, 0)) return 0; } pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) return -EACCES; path = dentry_path(dentry, pathbuf, PATH_MAX); integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path, "set_data", errmsg, -EACCES, 0); kfree(pathbuf); return -EACCES; } static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xvalue = xattr_value; int digsig = 0; int result; int err; result = ima_protect_xattr(dentry, xattr_name, xattr_value, xattr_value_len); if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; err = validate_hash_algo(dentry, xvalue, xattr_value_len); if (err) return err; digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG); } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) { digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG); } if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) ima_reset_appraise_flags(d_backing_inode(dentry), 0); return 0; } static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { int result; result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), 0); if (result == 1) result = 0; } return result; } static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } static struct security_hook_list ima_appraise_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr), LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr), LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl), LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr), LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl), }; void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks), lsmid); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 /* SPDX-License-Identifier: GPL-2.0 */ /* * ioport.h Definitions of routines for detecting, reserving and * allocating system resources. * * Authors: Linus Torvalds */ #ifndef _LINUX_IOPORT_H #define _LINUX_IOPORT_H #ifndef __ASSEMBLY__ #include <linux/bits.h> #include <linux/compiler.h> #include <linux/minmax.h> #include <linux/types.h> /* * Resources are tree-like, allowing * nesting etc.. */ struct resource { resource_size_t start; resource_size_t end; const char *name; unsigned long flags; unsigned long desc; struct resource *parent, *sibling, *child; }; /* * IO resources have these defined flags. * * PCI devices expose these flags to userspace in the "resource" sysfs file, * so don't move them. */ #define IORESOURCE_BITS 0x000000ff /* Bus-specific bits */ #define IORESOURCE_TYPE_BITS 0x00001f00 /* Resource type */ #define IORESOURCE_IO 0x00000100 /* PCI/ISA I/O ports */ #define IORESOURCE_MEM 0x00000200 #define IORESOURCE_REG 0x00000300 /* Register offsets */ #define IORESOURCE_IRQ 0x00000400 #define IORESOURCE_DMA 0x00000800 #define IORESOURCE_BUS 0x00001000 #define IORESOURCE_PREFETCH 0x00002000 /* No side effects */ #define IORESOURCE_READONLY 0x00004000 #define IORESOURCE_CACHEABLE 0x00008000 #define IORESOURCE_RANGELENGTH 0x00010000 #define IORESOURCE_SHADOWABLE 0x00020000 #define IORESOURCE_SIZEALIGN 0x00040000 /* size indicates alignment */ #define IORESOURCE_STARTALIGN 0x00080000 /* start field is alignment */ #define IORESOURCE_MEM_64 0x00100000 #define IORESOURCE_WINDOW 0x00200000 /* forwarded by bridge */ #define IORESOURCE_MUXED 0x00400000 /* Resource is software muxed */ #define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */ #define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */ /* IORESOURCE_SYSRAM specific bits. */ #define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. */ #define IORESOURCE_SYSRAM_MERGEABLE 0x04000000 /* Resource can be merged. */ #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ #define IORESOURCE_DISABLED 0x10000000 #define IORESOURCE_UNSET 0x20000000 /* No address assigned yet */ #define IORESOURCE_AUTO 0x40000000 #define IORESOURCE_BUSY 0x80000000 /* Driver has marked this resource busy */ /* I/O resource extended types */ #define IORESOURCE_SYSTEM_RAM (IORESOURCE_MEM|IORESOURCE_SYSRAM) /* PnP IRQ specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IRQ_HIGHEDGE (1<<0) #define IORESOURCE_IRQ_LOWEDGE (1<<1) #define IORESOURCE_IRQ_HIGHLEVEL (1<<2) #define IORESOURCE_IRQ_LOWLEVEL (1<<3) #define IORESOURCE_IRQ_SHAREABLE (1<<4) #define IORESOURCE_IRQ_OPTIONAL (1<<5) #define IORESOURCE_IRQ_WAKECAPABLE (1<<6) /* PnP DMA specific bits (IORESOURCE_BITS) */ #define IORESOURCE_DMA_TYPE_MASK (3<<0) #define IORESOURCE_DMA_8BIT (0<<0) #define IORESOURCE_DMA_8AND16BIT (1<<0) #define IORESOURCE_DMA_16BIT (2<<0) #define IORESOURCE_DMA_MASTER (1<<2) #define IORESOURCE_DMA_BYTE (1<<3) #define IORESOURCE_DMA_WORD (1<<4) #define IORESOURCE_DMA_SPEED_MASK (3<<6) #define IORESOURCE_DMA_COMPATIBLE (0<<6) #define IORESOURCE_DMA_TYPEA (1<<6) #define IORESOURCE_DMA_TYPEB (2<<6) #define IORESOURCE_DMA_TYPEF (3<<6) /* PnP memory I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_MEM_WRITEABLE (1<<0) /* dup: IORESOURCE_READONLY */ #define IORESOURCE_MEM_CACHEABLE (1<<1) /* dup: IORESOURCE_CACHEABLE */ #define IORESOURCE_MEM_RANGELENGTH (1<<2) /* dup: IORESOURCE_RANGELENGTH */ #define IORESOURCE_MEM_TYPE_MASK (3<<3) #define IORESOURCE_MEM_8BIT (0<<3) #define IORESOURCE_MEM_16BIT (1<<3) #define IORESOURCE_MEM_8AND16BIT (2<<3) #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) #define IORESOURCE_MEM_NONPOSTED (1<<7) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) #define IORESOURCE_IO_FIXED (1<<1) #define IORESOURCE_IO_SPARSE (1<<2) /* PCI ROM control bits (IORESOURCE_BITS) */ #define IORESOURCE_ROM_ENABLE (1<<0) /* ROM is enabled, same as PCI_ROM_ADDRESS_ENABLE */ #define IORESOURCE_ROM_SHADOW (1<<1) /* Use RAM image, not ROM BAR */ /* PCI control bits. Shares IORESOURCE_BITS with above PCI ROM. */ #define IORESOURCE_PCI_FIXED (1<<4) /* Do not move resource */ #define IORESOURCE_PCI_EA_BEI (1<<5) /* BAR Equivalent Indicator */ /* * I/O Resource Descriptors * * Descriptors are used by walk_iomem_res_desc() and region_intersects() * for searching a specific resource range in the iomem table. Assign * a new descriptor when a resource range supports the search interfaces. * Otherwise, resource.desc must be set to IORES_DESC_NONE (0). */ enum { IORES_DESC_NONE = 0, IORES_DESC_CRASH_KERNEL = 1, IORES_DESC_ACPI_TABLES = 2, IORES_DESC_ACPI_NV_STORAGE = 3, IORES_DESC_PERSISTENT_MEMORY = 4, IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, IORES_DESC_RESERVED = 7, IORES_DESC_SOFT_RESERVED = 8, IORES_DESC_CXL = 9, }; /* * Flags controlling ioremap() behavior. */ enum { IORES_MAP_SYSTEM_RAM = BIT(0), IORES_MAP_ENCRYPTED = BIT(1), }; /* helpers to define resources */ #define DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, _desc) \ (struct resource) { \ .start = (_start), \ .end = (_start) + (_size) - 1, \ .name = (_name), \ .flags = (_flags), \ .desc = (_desc), \ } #define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, IORES_DESC_NONE) #define DEFINE_RES(_start, _size, _flags) \ DEFINE_RES_NAMED(_start, _size, NULL, _flags) #define DEFINE_RES_IO_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_IO) #define DEFINE_RES_IO(_start, _size) \ DEFINE_RES_IO_NAMED((_start), (_size), NULL) #define DEFINE_RES_MEM_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_MEM) #define DEFINE_RES_MEM(_start, _size) \ DEFINE_RES_MEM_NAMED((_start), (_size), NULL) #define DEFINE_RES_REG_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_REG) #define DEFINE_RES_REG(_start, _size) \ DEFINE_RES_REG_NAMED((_start), (_size), NULL) #define DEFINE_RES_IRQ_NAMED(_irq, _name) \ DEFINE_RES_NAMED((_irq), 1, (_name), IORESOURCE_IRQ) #define DEFINE_RES_IRQ(_irq) \ DEFINE_RES_IRQ_NAMED((_irq), NULL) #define DEFINE_RES_DMA_NAMED(_dma, _name) \ DEFINE_RES_NAMED((_dma), 1, (_name), IORESOURCE_DMA) #define DEFINE_RES_DMA(_dma) \ DEFINE_RES_DMA_NAMED((_dma), NULL) /** * typedef resource_alignf - Resource alignment callback * @data: Private data used by the callback * @res: Resource candidate range (an empty resource space) * @size: The minimum size of the empty space * @align: Alignment from the constraints * * Callback allows calculating resource placement and alignment beyond min, * max, and align fields in the struct resource_constraint. * * Return: Start address for the resource. */ typedef resource_size_t (*resource_alignf)(void *data, const struct resource *res, resource_size_t size, resource_size_t align); /** * struct resource_constraint - constraints to be met while searching empty * resource space * @min: The minimum address for the memory range * @max: The maximum address for the memory range * @align: Alignment for the start address of the empty space * @alignf: Additional alignment constraints callback * @alignf_data: Data provided for @alignf callback * * Contains the range and alignment constraints that have to be met during * find_resource_space(). @alignf can be NULL indicating no alignment beyond * @align is necessary. */ struct resource_constraint { resource_size_t min, max, align; resource_alignf alignf; void *alignf_data; }; /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ extern struct resource ioport_resource; extern struct resource iomem_resource; extern struct resource *request_resource_conflict(struct resource *root, struct resource *new); extern int request_resource(struct resource *root, struct resource *new); extern int release_resource(struct resource *new); void release_child_resources(struct resource *new); extern void reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name); extern struct resource *insert_resource_conflict(struct resource *parent, struct resource *new); extern int insert_resource(struct resource *parent, struct resource *new); extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new); extern int remove_resource(struct resource *old); extern void arch_remove_reservations(struct resource *avail); extern int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, resource_alignf alignf, void *alignf_data); struct resource *lookup_resource(struct resource *root, resource_size_t start); int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size); resource_size_t resource_alignment(struct resource *res); /** * resource_set_size - Calculate resource end address from size and start * @res: Resource descriptor * @size: Size of the resource * * Calculate the end address for @res based on @size. * * Note: The start address of @res must be set when calling this function. * Prefer resource_set_range() if setting both the start address and @size. */ static inline void resource_set_size(struct resource *res, resource_size_t size) { res->end = res->start + size - 1; } /** * resource_set_range - Set resource start and end addresses * @res: Resource descriptor * @start: Start address for the resource * @size: Size of the resource * * Set @res start address and calculate the end address based on @size. */ static inline void resource_set_range(struct resource *res, resource_size_t start, resource_size_t size) { res->start = start; resource_set_size(res, size); } static inline resource_size_t resource_size(const struct resource *res) { return res->end - res->start + 1; } static inline unsigned long resource_type(const struct resource *res) { return res->flags & IORESOURCE_TYPE_BITS; } static inline unsigned long resource_ext_type(const struct resource *res) { return res->flags & IORESOURCE_EXT_TYPE_BITS; } /* True iff r1 completely contains r2 */ static inline bool resource_contains(const struct resource *r1, const struct resource *r2) { if (resource_type(r1) != resource_type(r2)) return false; if (r1->flags & IORESOURCE_UNSET || r2->flags & IORESOURCE_UNSET) return false; return r1->start <= r2->start && r1->end >= r2->end; } /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(const struct resource *r1, const struct resource *r2) { return r1->start <= r2->end && r1->end >= r2->start; } static inline bool resource_intersection(const struct resource *r1, const struct resource *r2, struct resource *r) { if (!resource_overlaps(r1, r2)) return false; r->start = max(r1->start, r2->start); r->end = min(r1->end, r2->end); return true; } static inline bool resource_union(const struct resource *r1, const struct resource *r2, struct resource *r) { if (!resource_overlaps(r1, r2)) return false; r->start = min(r1->start, r2->start); r->end = max(r1->end, r2->end); return true; } int find_resource_space(struct resource *root, struct resource *new, resource_size_t size, struct resource_constraint *constraint); /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) #define request_muxed_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), IORESOURCE_MUXED) #define __request_mem_region(start,n,name, excl) __request_region(&iomem_resource, (start), (n), (name), excl) #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name), 0) #define request_mem_region_muxed(start, n, name) \ __request_region(&iomem_resource, (start), (n), (name), IORESOURCE_MUXED) #define request_mem_region_exclusive(start,n,name) \ __request_region(&iomem_resource, (start), (n), (name), IORESOURCE_EXCLUSIVE) #define rename_region(region, newname) do { (region)->name = (newname); } while (0) extern struct resource * __request_region(struct resource *, resource_size_t start, resource_size_t n, const char *name, int flags); /* Compatibility cruft */ #define release_region(start,n) __release_region(&ioport_resource, (start), (n)) #define release_mem_region(start,n) __release_region(&iomem_resource, (start), (n)) extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE extern void release_mem_region_adjustable(resource_size_t, resource_size_t); #endif #ifdef CONFIG_MEMORY_HOTPLUG extern void merge_system_ram_resource(struct resource *res); #endif /* Wrappers for managed devices */ struct device; extern int devm_request_resource(struct device *dev, struct resource *root, struct resource *new); extern void devm_release_resource(struct device *dev, struct resource *new); #define devm_request_region(dev,start,n,name) \ __devm_request_region(dev, &ioport_resource, (start), (n), (name)) #define devm_request_mem_region(dev,start,n,name) \ __devm_request_region(dev, &iomem_resource, (start), (n), (name)) extern struct resource * __devm_request_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n, const char *name); #define devm_release_region(dev, start, n) \ __devm_release_region(dev, &ioport_resource, (start), (n)) #define devm_release_mem_region(dev, start, n) \ __devm_release_region(dev, &iomem_resource, (start), (n)) extern void __devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n); extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size); extern bool iomem_is_exclusive(u64 addr); extern bool resource_is_exclusive(struct resource *resource, u64 addr, resource_size_t size); extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); extern int walk_mem_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int walk_system_ram_res_rev(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size); struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name); struct resource *alloc_free_mem_region(struct resource *base, unsigned long size, unsigned long align, const char *name); static inline void irqresource_disabled(struct resource *res, u32 irq) { res->start = irq; res->end = irq; res->flags |= IORESOURCE_IRQ | IORESOURCE_DISABLED | IORESOURCE_UNSET; } extern struct address_space *iomem_get_mapping(void); #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */
25 2200 1 1 1 1 2204 2199 2199 3 1 1 1 1 1 1 1 169 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 // SPDX-License-Identifier: GPL-2.0-only /* * CAIF Interface registration. * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland * * Borrowed heavily from file: pn_dev.c. Thanks to Remi Denis-Courmont * and Sakari Ailus <sakari.ailus@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/if_arp.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/spinlock.h> #include <net/netns/generic.h> #include <net/net_namespace.h> #include <net/pkt_sched.h> #include <net/caif/caif_device.h> #include <net/caif/caif_layer.h> #include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfserl.h> MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol support"); MODULE_LICENSE("GPL"); /* Used for local tracking of the CAIF net devices */ struct caif_device_entry { struct cflayer layer; struct list_head list; struct net_device *netdev; int __percpu *pcpu_refcnt; spinlock_t flow_lock; struct sk_buff *xoff_skb; void (*xoff_skb_dtor)(struct sk_buff *skb); bool xoff; }; struct caif_device_entry_list { struct list_head list; /* Protects simulanous deletes in list */ struct mutex lock; }; struct caif_net { struct cfcnfg *cfg; struct caif_device_entry_list caifdevs; }; static unsigned int caif_net_id; static int q_high = 50; /* Percent */ struct cfcnfg *get_cfcnfg(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); return caifn->cfg; } EXPORT_SYMBOL(get_cfcnfg); static struct caif_device_entry_list *caif_device_list(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); return &caifn->caifdevs; } static void caifd_put(struct caif_device_entry *e) { this_cpu_dec(*e->pcpu_refcnt); } static void caifd_hold(struct caif_device_entry *e) { this_cpu_inc(*e->pcpu_refcnt); } static int caifd_refcnt_read(struct caif_device_entry *e) { int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(e->pcpu_refcnt, i); return refcnt; } /* Allocate new CAIF device. */ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) { struct caif_device_entry *caifd; caifd = kzalloc(sizeof(*caifd), GFP_KERNEL); if (!caifd) return NULL; caifd->pcpu_refcnt = alloc_percpu(int); if (!caifd->pcpu_refcnt) { kfree(caifd); return NULL; } caifd->netdev = dev; dev_hold(dev); return caifd; } static struct caif_device_entry *caif_get(struct net_device *dev) { struct caif_device_entry_list *caifdevs = caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; list_for_each_entry_rcu(caifd, &caifdevs->list, list, lockdep_rtnl_is_held()) { if (caifd->netdev == dev) return caifd; } return NULL; } static void caif_flow_cb(struct sk_buff *skb) { struct caif_device_entry *caifd; void (*dtor)(struct sk_buff *skb) = NULL; bool send_xoff; WARN_ON(skb->dev == NULL); rcu_read_lock(); caifd = caif_get(skb->dev); WARN_ON(caifd == NULL); if (!caifd) { rcu_read_unlock(); return; } caifd_hold(caifd); rcu_read_unlock(); spin_lock_bh(&caifd->flow_lock); send_xoff = caifd->xoff; caifd->xoff = false; dtor = caifd->xoff_skb_dtor; if (WARN_ON(caifd->xoff_skb != skb)) skb = NULL; caifd->xoff_skb = NULL; caifd->xoff_skb_dtor = NULL; spin_unlock_bh(&caifd->flow_lock); if (dtor && skb) dtor(skb); if (send_xoff) caifd->layer.up-> ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND, caifd->layer.id); caifd_put(caifd); } static int transmit(struct cflayer *layer, struct cfpkt *pkt) { int err, high = 0, qlen = 0; struct caif_device_entry *caifd = container_of(layer, struct caif_device_entry, layer); struct sk_buff *skb; struct netdev_queue *txq; rcu_read_lock_bh(); skb = cfpkt_tonative(pkt); skb->dev = caifd->netdev; skb_reset_network_header(skb); skb->protocol = htons(ETH_P_CAIF); /* Check if we need to handle xoff */ if (likely(caifd->netdev->priv_flags & IFF_NO_QUEUE)) goto noxoff; if (unlikely(caifd->xoff)) goto noxoff; if (likely(!netif_queue_stopped(caifd->netdev))) { struct Qdisc *sch; /* If we run with a TX queue, check if the queue is too long*/ txq = netdev_get_tx_queue(skb->dev, 0); sch = rcu_dereference_bh(txq->qdisc); if (likely(qdisc_is_empty(sch))) goto noxoff; /* can check for explicit qdisc len value only !NOLOCK, * always set flow off otherwise */ high = (caifd->netdev->tx_queue_len * q_high) / 100; if (!(sch->flags & TCQ_F_NOLOCK) && likely(sch->q.qlen < high)) goto noxoff; } /* Hold lock while accessing xoff */ spin_lock_bh(&caifd->flow_lock); if (caifd->xoff) { spin_unlock_bh(&caifd->flow_lock); goto noxoff; } /* * Handle flow off, we do this by temporary hi-jacking this * skb's destructor function, and replace it with our own * flow-on callback. The callback will set flow-on and call * the original destructor. */ pr_debug("queue has stopped(%d) or is full (%d > %d)\n", netif_queue_stopped(caifd->netdev), qlen, high); caifd->xoff = true; caifd->xoff_skb = skb; caifd->xoff_skb_dtor = skb->destructor; skb->destructor = caif_flow_cb; spin_unlock_bh(&caifd->flow_lock); caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, caifd->layer.id); noxoff: rcu_read_unlock_bh(); err = dev_queue_xmit(skb); if (err > 0) err = -EIO; return err; } /* * Stuff received packets into the CAIF stack. * On error, returns non-zero and releases the skb. */ static int receive(struct sk_buff *skb, struct net_device *dev, struct packet_type *pkttype, struct net_device *orig_dev) { struct cfpkt *pkt; struct caif_device_entry *caifd; int err; pkt = cfpkt_fromnative(CAIF_DIR_IN, skb); rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->receive || !netif_oper_up(caifd->netdev)) { rcu_read_unlock(); kfree_skb(skb); return NET_RX_DROP; } /* Hold reference to netdevice while using CAIF stack */ caifd_hold(caifd); rcu_read_unlock(); err = caifd->layer.up->receive(caifd->layer.up, pkt); /* For -EILSEQ the packet is not freed so free it now */ if (err == -EILSEQ) cfpkt_destroy(pkt); /* Release reference to stack upwards */ caifd_put(caifd); if (err != 0) err = NET_RX_DROP; return err; } static struct packet_type caif_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_CAIF), .func = receive, }; static void dev_flowctrl(struct net_device *dev, int on) { struct caif_device_entry *caifd; rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { rcu_read_unlock(); return; } caifd_hold(caifd); rcu_read_unlock(); caifd->layer.up->ctrlcmd(caifd->layer.up, on ? _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND : _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, caifd->layer.id); caifd_put(caifd); } int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, struct cflayer *link_support, int head_room, struct cflayer **layer, int (**rcv_func)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *)) { struct caif_device_entry *caifd; enum cfcnfg_phy_preference pref; struct cfcnfg *cfg = get_cfcnfg(dev_net(dev)); struct caif_device_entry_list *caifdevs; int res; caifdevs = caif_device_list(dev_net(dev)); caifd = caif_device_alloc(dev); if (!caifd) return -ENOMEM; *layer = &caifd->layer; spin_lock_init(&caifd->flow_lock); switch (caifdev->link_select) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: pref = CFPHYPREF_HIGH_BW; break; } mutex_lock(&caifdevs->lock); list_add_rcu(&caifd->list, &caifdevs->list); strscpy(caifd->layer.name, dev->name, sizeof(caifd->layer.name)); caifd->layer.transmit = transmit; res = cfcnfg_add_phy_layer(cfg, dev, &caifd->layer, pref, link_support, caifdev->use_fcs, head_room); mutex_unlock(&caifdevs->lock); if (rcv_func) *rcv_func = receive; return res; } EXPORT_SYMBOL(caif_enroll_dev); /* notify Caif of device events */ static int caif_device_notify(struct notifier_block *me, unsigned long what, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct caif_device_entry *caifd = NULL; struct caif_dev_common *caifdev; struct cfcnfg *cfg; struct cflayer *layer, *link_support; int head_room = 0; struct caif_device_entry_list *caifdevs; int res; cfg = get_cfcnfg(dev_net(dev)); caifdevs = caif_device_list(dev_net(dev)); caifd = caif_get(dev); if (caifd == NULL && dev->type != ARPHRD_CAIF) return 0; switch (what) { case NETDEV_REGISTER: if (caifd != NULL) break; caifdev = netdev_priv(dev); link_support = NULL; if (caifdev->use_frag) { head_room = 1; link_support = cfserl_create(dev->ifindex, caifdev->use_stx); if (!link_support) { pr_warn("Out of memory\n"); break; } } res = caif_enroll_dev(dev, caifdev, link_support, head_room, &layer, NULL); if (res) cfserl_release(link_support); caifdev->flowctrl = dev_flowctrl; break; case NETDEV_UP: rcu_read_lock(); caifd = caif_get(dev); if (caifd == NULL) { rcu_read_unlock(); break; } caifd->xoff = false; cfcnfg_set_phy_state(cfg, &caifd->layer, true); rcu_read_unlock(); break; case NETDEV_DOWN: rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { rcu_read_unlock(); return -EINVAL; } cfcnfg_set_phy_state(cfg, &caifd->layer, false); caifd_hold(caifd); rcu_read_unlock(); caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_DOWN_IND, caifd->layer.id); spin_lock_bh(&caifd->flow_lock); /* * Replace our xoff-destructor with original destructor. * We trust that skb->destructor *always* is called before * the skb reference is invalid. The hijacked SKB destructor * takes the flow_lock so manipulating the skb->destructor here * should be safe. */ if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL) caifd->xoff_skb->destructor = caifd->xoff_skb_dtor; caifd->xoff = false; caifd->xoff_skb_dtor = NULL; caifd->xoff_skb = NULL; spin_unlock_bh(&caifd->flow_lock); caifd_put(caifd); break; case NETDEV_UNREGISTER: mutex_lock(&caifdevs->lock); caifd = caif_get(dev); if (caifd == NULL) { mutex_unlock(&caifdevs->lock); break; } list_del_rcu(&caifd->list); /* * NETDEV_UNREGISTER is called repeatedly until all reference * counts for the net-device are released. If references to * caifd is taken, simply ignore NETDEV_UNREGISTER and wait for * the next call to NETDEV_UNREGISTER. * * If any packets are in flight down the CAIF Stack, * cfcnfg_del_phy_layer will return nonzero. * If no packets are in flight, the CAIF Stack associated * with the net-device un-registering is freed. */ if (caifd_refcnt_read(caifd) != 0 || cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0) { pr_info("Wait for device inuse\n"); /* Enrole device if CAIF Stack is still in use */ list_add_rcu(&caifd->list, &caifdevs->list); mutex_unlock(&caifdevs->lock); break; } synchronize_rcu(); dev_put(caifd->netdev); free_percpu(caifd->pcpu_refcnt); kfree(caifd); mutex_unlock(&caifdevs->lock); break; } return 0; } static struct notifier_block caif_device_notifier = { .notifier_call = caif_device_notify, .priority = 0, }; /* Per-namespace Caif devices handling */ static int caif_init_net(struct net *net) { struct caif_net *caifn = net_generic(net, caif_net_id); INIT_LIST_HEAD(&caifn->caifdevs.list); mutex_init(&caifn->caifdevs.lock); caifn->cfg = cfcnfg_create(); if (!caifn->cfg) return -ENOMEM; return 0; } static void caif_exit_net(struct net *net) { struct caif_device_entry *caifd, *tmp; struct caif_device_entry_list *caifdevs = caif_device_list(net); struct cfcnfg *cfg = get_cfcnfg(net); rtnl_lock(); mutex_lock(&caifdevs->lock); list_for_each_entry_safe(caifd, tmp, &caifdevs->list, list) { int i = 0; list_del_rcu(&caifd->list); cfcnfg_set_phy_state(cfg, &caifd->layer, false); while (i < 10 && (caifd_refcnt_read(caifd) != 0 || cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0)) { pr_info("Wait for device inuse\n"); msleep(250); i++; } synchronize_rcu(); dev_put(caifd->netdev); free_percpu(caifd->pcpu_refcnt); kfree(caifd); } cfcnfg_remove(cfg); mutex_unlock(&caifdevs->lock); rtnl_unlock(); } static struct pernet_operations caif_net_ops = { .init = caif_init_net, .exit = caif_exit_net, .id = &caif_net_id, .size = sizeof(struct caif_net), }; /* Initialize Caif devices list */ static int __init caif_device_init(void) { int result; result = register_pernet_subsys(&caif_net_ops); if (result) return result; register_netdevice_notifier(&caif_device_notifier); dev_add_pack(&caif_packet_type); return result; } static void __exit caif_device_exit(void) { unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_packet_type); unregister_pernet_subsys(&caif_net_ops); } module_init(caif_device_init); module_exit(caif_device_exit);
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Definitions for the IPPROTO_SMC (socket related) * * Copyright IBM Corp. 2016, 2018 * Copyright (c) 2024, Alibaba Inc. * * Author: D. Wythe <alibuda@linux.alibaba.com> */ #include <net/protocol.h> #include <net/sock.h> #include "smc_inet.h" #include "smc.h" static int smc_inet_init_sock(struct sock *sk); static struct proto smc_inet_prot = { .name = "INET_SMC", .owner = THIS_MODULE, .init = smc_inet_init_sock, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; static const struct proto_ops smc_inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; static struct inet_protosw smc_inet_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SMC, .prot = &smc_inet_prot, .ops = &smc_inet_stream_ops, .flags = INET_PROTOSW_ICSK, }; #if IS_ENABLED(CONFIG_IPV6) struct smc6_sock { struct smc_sock smc; struct ipv6_pinfo inet6; }; static struct proto smc_inet6_prot = { .name = "INET6_SMC", .owner = THIS_MODULE, .init = smc_inet_init_sock, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc6_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, .ipv6_pinfo_offset = offsetof(struct smc6_sock, inet6), }; static const struct proto_ops smc_inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; static struct inet_protosw smc_inet6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SMC, .prot = &smc_inet6_prot, .ops = &smc_inet6_stream_ops, .flags = INET_PROTOSW_ICSK, }; #endif /* CONFIG_IPV6 */ static unsigned int smc_sync_mss(struct sock *sk, u32 pmtu) { /* No need pass it through to clcsock, mss can always be set by * sock_create_kern or smc_setsockopt. */ return 0; } static int smc_inet_init_sock(struct sock *sk) { struct net *net = sock_net(sk); /* init common smc sock */ smc_sk_init(net, sk, IPPROTO_SMC); inet_csk(sk)->icsk_sync_mss = smc_sync_mss; /* create clcsock */ return smc_create_clcsk(net, sk, sk->sk_family); } int __init smc_inet_init(void) { int rc; rc = proto_register(&smc_inet_prot, 1); if (rc) { pr_err("%s: proto_register smc_inet_prot fails with %d\n", __func__, rc); return rc; } /* no return value */ inet_register_protosw(&smc_inet_protosw); #if IS_ENABLED(CONFIG_IPV6) rc = proto_register(&smc_inet6_prot, 1); if (rc) { pr_err("%s: proto_register smc_inet6_prot fails with %d\n", __func__, rc); goto out_inet6_prot; } rc = inet6_register_protosw(&smc_inet6_protosw); if (rc) { pr_err("%s: inet6_register_protosw smc_inet6_protosw fails with %d\n", __func__, rc); goto out_inet6_protosw; } return rc; out_inet6_protosw: proto_unregister(&smc_inet6_prot); out_inet6_prot: inet_unregister_protosw(&smc_inet_protosw); proto_unregister(&smc_inet_prot); #endif /* CONFIG_IPV6 */ return rc; } void smc_inet_exit(void) { #if IS_ENABLED(CONFIG_IPV6) inet6_unregister_protosw(&smc_inet6_protosw); proto_unregister(&smc_inet6_prot); #endif /* CONFIG_IPV6 */ inet_unregister_protosw(&smc_inet_protosw); proto_unregister(&smc_inet_prot); }
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 // SPDX-License-Identifier: GPL-2.0+ /* * Belkin USB Serial Adapter Driver * * Copyright (C) 2000 William Greathouse (wgreathouse@smva.com) * Copyright (C) 2000-2001 Greg Kroah-Hartman (greg@kroah.com) * Copyright (C) 2010 Johan Hovold (jhovold@gmail.com) * * This program is largely derived from work by the linux-usb group * and associated source files. Please see the usb/serial files for * individual credits and copyrights. * * See Documentation/usb/usb-serial.rst for more information on using this * driver * * TODO: * -- Add true modem control line query capability. Currently we track the * states reported by the interrupt and the states we request. * -- Add support for flush commands */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include "belkin_sa.h" #define DRIVER_AUTHOR "William Greathouse <wgreathouse@smva.com>" #define DRIVER_DESC "USB Belkin Serial converter driver" /* function prototypes for a Belkin USB Serial Adapter F5U103 */ static int belkin_sa_port_probe(struct usb_serial_port *port); static void belkin_sa_port_remove(struct usb_serial_port *port); static int belkin_sa_open(struct tty_struct *tty, struct usb_serial_port *port); static void belkin_sa_close(struct usb_serial_port *port); static void belkin_sa_read_int_callback(struct urb *urb); static void belkin_sa_process_read_urb(struct urb *urb); static void belkin_sa_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios); static int belkin_sa_break_ctl(struct tty_struct *tty, int break_state); static int belkin_sa_tiocmget(struct tty_struct *tty); static int belkin_sa_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear); static const struct usb_device_id id_table[] = { { USB_DEVICE(BELKIN_SA_VID, BELKIN_SA_PID) }, { USB_DEVICE(BELKIN_OLD_VID, BELKIN_OLD_PID) }, { USB_DEVICE(PERACOM_VID, PERACOM_PID) }, { USB_DEVICE(GOHUBS_VID, GOHUBS_PID) }, { USB_DEVICE(GOHUBS_VID, HANDYLINK_PID) }, { USB_DEVICE(BELKIN_DOCKSTATION_VID, BELKIN_DOCKSTATION_PID) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table); /* All of the device info needed for the serial converters */ static struct usb_serial_driver belkin_device = { .driver = { .name = "belkin", }, .description = "Belkin / Peracom / GoHubs USB Serial Adapter", .id_table = id_table, .num_ports = 1, .open = belkin_sa_open, .close = belkin_sa_close, .read_int_callback = belkin_sa_read_int_callback, .process_read_urb = belkin_sa_process_read_urb, .set_termios = belkin_sa_set_termios, .break_ctl = belkin_sa_break_ctl, .tiocmget = belkin_sa_tiocmget, .tiocmset = belkin_sa_tiocmset, .port_probe = belkin_sa_port_probe, .port_remove = belkin_sa_port_remove, }; static struct usb_serial_driver * const serial_drivers[] = { &belkin_device, NULL }; struct belkin_sa_private { spinlock_t lock; unsigned long control_state; unsigned char last_lsr; unsigned char last_msr; int bad_flow_control; }; /* * *************************************************************************** * Belkin USB Serial Adapter F5U103 specific driver functions * *************************************************************************** */ #define WDR_TIMEOUT 5000 /* default urb timeout */ /* assumes that struct usb_serial *serial is available */ #define BSA_USB_CMD(c, v) usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), \ (c), BELKIN_SA_SET_REQUEST_TYPE, \ (v), 0, NULL, 0, WDR_TIMEOUT) static int belkin_sa_port_probe(struct usb_serial_port *port) { struct usb_device *dev = port->serial->dev; struct belkin_sa_private *priv; priv = kmalloc(sizeof(struct belkin_sa_private), GFP_KERNEL); if (!priv) return -ENOMEM; spin_lock_init(&priv->lock); priv->control_state = 0; priv->last_lsr = 0; priv->last_msr = 0; /* see comments at top of file */ priv->bad_flow_control = (le16_to_cpu(dev->descriptor.bcdDevice) <= 0x0206) ? 1 : 0; dev_info(&dev->dev, "bcdDevice: %04x, bfc: %d\n", le16_to_cpu(dev->descriptor.bcdDevice), priv->bad_flow_control); usb_set_serial_port_data(port, priv); return 0; } static void belkin_sa_port_remove(struct usb_serial_port *port) { struct belkin_sa_private *priv; priv = usb_get_serial_port_data(port); kfree(priv); } static int belkin_sa_open(struct tty_struct *tty, struct usb_serial_port *port) { int retval; retval = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (retval) { dev_err(&port->dev, "usb_submit_urb(read int) failed\n"); return retval; } retval = usb_serial_generic_open(tty, port); if (retval) usb_kill_urb(port->interrupt_in_urb); return retval; } static void belkin_sa_close(struct usb_serial_port *port) { usb_serial_generic_close(port); usb_kill_urb(port->interrupt_in_urb); } static void belkin_sa_read_int_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct belkin_sa_private *priv; unsigned char *data = urb->transfer_buffer; int retval; int status = urb->status; unsigned long flags; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&port->dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_dbg(&port->dev, "%s - nonzero urb status received: %d\n", __func__, status); goto exit; } usb_serial_debug_data(&port->dev, __func__, urb->actual_length, data); /* Handle known interrupt data */ /* ignore data[0] and data[1] */ priv = usb_get_serial_port_data(port); spin_lock_irqsave(&priv->lock, flags); priv->last_msr = data[BELKIN_SA_MSR_INDEX]; /* Record Control Line states */ if (priv->last_msr & BELKIN_SA_MSR_DSR) priv->control_state |= TIOCM_DSR; else priv->control_state &= ~TIOCM_DSR; if (priv->last_msr & BELKIN_SA_MSR_CTS) priv->control_state |= TIOCM_CTS; else priv->control_state &= ~TIOCM_CTS; if (priv->last_msr & BELKIN_SA_MSR_RI) priv->control_state |= TIOCM_RI; else priv->control_state &= ~TIOCM_RI; if (priv->last_msr & BELKIN_SA_MSR_CD) priv->control_state |= TIOCM_CD; else priv->control_state &= ~TIOCM_CD; priv->last_lsr = data[BELKIN_SA_LSR_INDEX]; spin_unlock_irqrestore(&priv->lock, flags); exit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) dev_err(&port->dev, "%s - usb_submit_urb failed with " "result %d\n", __func__, retval); } static void belkin_sa_process_read_urb(struct urb *urb) { struct usb_serial_port *port = urb->context; struct belkin_sa_private *priv = usb_get_serial_port_data(port); unsigned char *data = urb->transfer_buffer; unsigned long flags; unsigned char status; char tty_flag; /* Update line status */ tty_flag = TTY_NORMAL; spin_lock_irqsave(&priv->lock, flags); status = priv->last_lsr; priv->last_lsr &= ~BELKIN_SA_LSR_ERR; spin_unlock_irqrestore(&priv->lock, flags); if (!urb->actual_length) return; if (status & BELKIN_SA_LSR_ERR) { /* Break takes precedence over parity, which takes precedence * over framing errors. */ if (status & BELKIN_SA_LSR_BI) tty_flag = TTY_BREAK; else if (status & BELKIN_SA_LSR_PE) tty_flag = TTY_PARITY; else if (status & BELKIN_SA_LSR_FE) tty_flag = TTY_FRAME; dev_dbg(&port->dev, "tty_flag = %d\n", tty_flag); /* Overrun is special, not associated with a char. */ if (status & BELKIN_SA_LSR_OE) tty_insert_flip_char(&port->port, 0, TTY_OVERRUN); } tty_insert_flip_string_fixed_flag(&port->port, data, tty_flag, urb->actual_length); tty_flip_buffer_push(&port->port); } static void belkin_sa_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct usb_serial *serial = port->serial; struct belkin_sa_private *priv = usb_get_serial_port_data(port); unsigned int iflag; unsigned int cflag; unsigned int old_iflag = 0; unsigned int old_cflag = 0; __u16 urb_value = 0; /* Will hold the new flags */ unsigned long flags; unsigned long control_state; int bad_flow_control; speed_t baud; struct ktermios *termios = &tty->termios; iflag = termios->c_iflag; cflag = termios->c_cflag; termios->c_cflag &= ~CMSPAR; /* get a local copy of the current port settings */ spin_lock_irqsave(&priv->lock, flags); control_state = priv->control_state; bad_flow_control = priv->bad_flow_control; spin_unlock_irqrestore(&priv->lock, flags); old_iflag = old_termios->c_iflag; old_cflag = old_termios->c_cflag; /* Set the baud rate */ if ((cflag & CBAUD) != (old_cflag & CBAUD)) { /* reassert DTR and (maybe) RTS on transition from B0 */ if ((old_cflag & CBAUD) == B0) { control_state |= (TIOCM_DTR|TIOCM_RTS); if (BSA_USB_CMD(BELKIN_SA_SET_DTR_REQUEST, 1) < 0) dev_err(&port->dev, "Set DTR error\n"); /* don't set RTS if using hardware flow control */ if (!(old_cflag & CRTSCTS)) if (BSA_USB_CMD(BELKIN_SA_SET_RTS_REQUEST , 1) < 0) dev_err(&port->dev, "Set RTS error\n"); } } baud = tty_get_baud_rate(tty); if (baud) { urb_value = BELKIN_SA_BAUD(baud); /* Clip to maximum speed */ if (urb_value == 0) urb_value = 1; /* Turn it back into a resulting real baud rate */ baud = BELKIN_SA_BAUD(urb_value); /* Report the actual baud rate back to the caller */ tty_encode_baud_rate(tty, baud, baud); if (BSA_USB_CMD(BELKIN_SA_SET_BAUDRATE_REQUEST, urb_value) < 0) dev_err(&port->dev, "Set baudrate error\n"); } else { /* Disable flow control */ if (BSA_USB_CMD(BELKIN_SA_SET_FLOW_CTRL_REQUEST, BELKIN_SA_FLOW_NONE) < 0) dev_err(&port->dev, "Disable flowcontrol error\n"); /* Drop RTS and DTR */ control_state &= ~(TIOCM_DTR | TIOCM_RTS); if (BSA_USB_CMD(BELKIN_SA_SET_DTR_REQUEST, 0) < 0) dev_err(&port->dev, "DTR LOW error\n"); if (BSA_USB_CMD(BELKIN_SA_SET_RTS_REQUEST, 0) < 0) dev_err(&port->dev, "RTS LOW error\n"); } /* set the parity */ if ((cflag ^ old_cflag) & (PARENB | PARODD)) { if (cflag & PARENB) urb_value = (cflag & PARODD) ? BELKIN_SA_PARITY_ODD : BELKIN_SA_PARITY_EVEN; else urb_value = BELKIN_SA_PARITY_NONE; if (BSA_USB_CMD(BELKIN_SA_SET_PARITY_REQUEST, urb_value) < 0) dev_err(&port->dev, "Set parity error\n"); } /* set the number of data bits */ if ((cflag & CSIZE) != (old_cflag & CSIZE)) { urb_value = BELKIN_SA_DATA_BITS(tty_get_char_size(cflag)); if (BSA_USB_CMD(BELKIN_SA_SET_DATA_BITS_REQUEST, urb_value) < 0) dev_err(&port->dev, "Set data bits error\n"); } /* set the number of stop bits */ if ((cflag & CSTOPB) != (old_cflag & CSTOPB)) { urb_value = (cflag & CSTOPB) ? BELKIN_SA_STOP_BITS(2) : BELKIN_SA_STOP_BITS(1); if (BSA_USB_CMD(BELKIN_SA_SET_STOP_BITS_REQUEST, urb_value) < 0) dev_err(&port->dev, "Set stop bits error\n"); } /* Set flow control */ if (((iflag ^ old_iflag) & (IXOFF | IXON)) || ((cflag ^ old_cflag) & CRTSCTS)) { urb_value = 0; if ((iflag & IXOFF) || (iflag & IXON)) urb_value |= (BELKIN_SA_FLOW_OXON | BELKIN_SA_FLOW_IXON); else urb_value &= ~(BELKIN_SA_FLOW_OXON | BELKIN_SA_FLOW_IXON); if (cflag & CRTSCTS) urb_value |= (BELKIN_SA_FLOW_OCTS | BELKIN_SA_FLOW_IRTS); else urb_value &= ~(BELKIN_SA_FLOW_OCTS | BELKIN_SA_FLOW_IRTS); if (bad_flow_control) urb_value &= ~(BELKIN_SA_FLOW_IRTS); if (BSA_USB_CMD(BELKIN_SA_SET_FLOW_CTRL_REQUEST, urb_value) < 0) dev_err(&port->dev, "Set flow control error\n"); } /* save off the modified port settings */ spin_lock_irqsave(&priv->lock, flags); priv->control_state = control_state; spin_unlock_irqrestore(&priv->lock, flags); } static int belkin_sa_break_ctl(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; int ret; ret = BSA_USB_CMD(BELKIN_SA_SET_BREAK_REQUEST, break_state ? 1 : 0); if (ret < 0) { dev_err(&port->dev, "Set break_ctl %d\n", break_state); return ret; } return 0; } static int belkin_sa_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct belkin_sa_private *priv = usb_get_serial_port_data(port); unsigned long control_state; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); control_state = priv->control_state; spin_unlock_irqrestore(&priv->lock, flags); return control_state; } static int belkin_sa_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; struct belkin_sa_private *priv = usb_get_serial_port_data(port); unsigned long control_state; unsigned long flags; int retval; int rts = 0; int dtr = 0; spin_lock_irqsave(&priv->lock, flags); control_state = priv->control_state; if (set & TIOCM_RTS) { control_state |= TIOCM_RTS; rts = 1; } if (set & TIOCM_DTR) { control_state |= TIOCM_DTR; dtr = 1; } if (clear & TIOCM_RTS) { control_state &= ~TIOCM_RTS; rts = 0; } if (clear & TIOCM_DTR) { control_state &= ~TIOCM_DTR; dtr = 0; } priv->control_state = control_state; spin_unlock_irqrestore(&priv->lock, flags); retval = BSA_USB_CMD(BELKIN_SA_SET_RTS_REQUEST, rts); if (retval < 0) { dev_err(&port->dev, "Set RTS error %d\n", retval); goto exit; } retval = BSA_USB_CMD(BELKIN_SA_SET_DTR_REQUEST, dtr); if (retval < 0) { dev_err(&port->dev, "Set DTR error %d\n", retval); goto exit; } exit: return retval; } module_usb_serial_driver(serial_drivers, id_table); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 // SPDX-License-Identifier: GPL-2.0 /* * Symbol USB barcode to serial driver * * Copyright (C) 2013 Johan Hovold <jhovold@gmail.com> * Copyright (C) 2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (C) 2009 Novell Inc. */ #include <linux/kernel.h> #include <linux/tty.h> #include <linux/slab.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/uaccess.h> static const struct usb_device_id id_table[] = { { USB_DEVICE(0x05e0, 0x0600) }, { }, }; MODULE_DEVICE_TABLE(usb, id_table); struct symbol_private { spinlock_t lock; /* protects the following flags */ bool throttled; bool actually_throttled; }; static void symbol_int_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; struct symbol_private *priv = usb_get_serial_port_data(port); unsigned char *data = urb->transfer_buffer; int status = urb->status; unsigned long flags; int result; int data_length; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&port->dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_dbg(&port->dev, "%s - nonzero urb status received: %d\n", __func__, status); goto exit; } usb_serial_debug_data(&port->dev, __func__, urb->actual_length, data); /* * Data from the device comes with a 1 byte header: * * <size of data> <data>... */ if (urb->actual_length > 1) { data_length = data[0]; if (data_length > (urb->actual_length - 1)) data_length = urb->actual_length - 1; tty_insert_flip_string(&port->port, &data[1], data_length); tty_flip_buffer_push(&port->port); } else { dev_dbg(&port->dev, "%s - short packet\n", __func__); } exit: spin_lock_irqsave(&priv->lock, flags); /* Continue trying to always read if we should */ if (!priv->throttled) { result = usb_submit_urb(port->interrupt_in_urb, GFP_ATOMIC); if (result) dev_err(&port->dev, "%s - failed resubmitting read urb, error %d\n", __func__, result); } else priv->actually_throttled = true; spin_unlock_irqrestore(&priv->lock, flags); } static int symbol_open(struct tty_struct *tty, struct usb_serial_port *port) { struct symbol_private *priv = usb_get_serial_port_data(port); unsigned long flags; int result = 0; spin_lock_irqsave(&priv->lock, flags); priv->throttled = false; priv->actually_throttled = false; spin_unlock_irqrestore(&priv->lock, flags); /* Start reading from the device */ result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (result) dev_err(&port->dev, "%s - failed resubmitting read urb, error %d\n", __func__, result); return result; } static void symbol_close(struct usb_serial_port *port) { usb_kill_urb(port->interrupt_in_urb); } static void symbol_throttle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct symbol_private *priv = usb_get_serial_port_data(port); spin_lock_irq(&priv->lock); priv->throttled = true; spin_unlock_irq(&priv->lock); } static void symbol_unthrottle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct symbol_private *priv = usb_get_serial_port_data(port); int result; bool was_throttled; spin_lock_irq(&priv->lock); priv->throttled = false; was_throttled = priv->actually_throttled; priv->actually_throttled = false; spin_unlock_irq(&priv->lock); if (was_throttled) { result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (result) dev_err(&port->dev, "%s - failed submitting read urb, error %d\n", __func__, result); } } static int symbol_port_probe(struct usb_serial_port *port) { struct symbol_private *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; spin_lock_init(&priv->lock); usb_set_serial_port_data(port, priv); return 0; } static void symbol_port_remove(struct usb_serial_port *port) { struct symbol_private *priv = usb_get_serial_port_data(port); kfree(priv); } static struct usb_serial_driver symbol_device = { .driver = { .name = "symbol", }, .id_table = id_table, .num_ports = 1, .num_interrupt_in = 1, .port_probe = symbol_port_probe, .port_remove = symbol_port_remove, .open = symbol_open, .close = symbol_close, .throttle = symbol_throttle, .unthrottle = symbol_unthrottle, .read_int_callback = symbol_int_callback, }; static struct usb_serial_driver * const serial_drivers[] = { &symbol_device, NULL }; module_usb_serial_driver(serial_drivers, id_table); MODULE_DESCRIPTION("Symbol USB barcode to serial driver"); MODULE_LICENSE("GPL v2");
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 // SPDX-License-Identifier: GPL-2.0+ /* * nzxt-kraken2.c - hwmon driver for NZXT Kraken X42/X52/X62/X72 coolers * * The device asynchronously sends HID reports (with id 0x04) twice a second to * communicate current fan speed, pump speed and coolant temperature. The * device does not respond to Get_Report requests for this status report. * * Copyright 2019-2021 Jonas Malaco <jonas@protocubo.io> */ #include <linux/unaligned.h> #include <linux/hid.h> #include <linux/hwmon.h> #include <linux/jiffies.h> #include <linux/module.h> #define STATUS_REPORT_ID 0x04 #define STATUS_VALIDITY 2 /* seconds; equivalent to 4 missed updates */ static const char *const kraken2_temp_label[] = { "Coolant", }; static const char *const kraken2_fan_label[] = { "Fan", "Pump", }; struct kraken2_priv_data { struct hid_device *hid_dev; struct device *hwmon_dev; s32 temp_input[1]; u16 fan_input[2]; unsigned long updated; /* jiffies */ }; static int kraken2_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *val) { struct kraken2_priv_data *priv = dev_get_drvdata(dev); if (time_after(jiffies, priv->updated + STATUS_VALIDITY * HZ)) return -ENODATA; switch (type) { case hwmon_temp: *val = priv->temp_input[channel]; break; case hwmon_fan: *val = priv->fan_input[channel]; break; default: return -EOPNOTSUPP; /* unreachable */ } return 0; } static int kraken2_read_string(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, const char **str) { switch (type) { case hwmon_temp: *str = kraken2_temp_label[channel]; break; case hwmon_fan: *str = kraken2_fan_label[channel]; break; default: return -EOPNOTSUPP; /* unreachable */ } return 0; } static const struct hwmon_ops kraken2_hwmon_ops = { .visible = 0444, .read = kraken2_read, .read_string = kraken2_read_string, }; static const struct hwmon_channel_info * const kraken2_info[] = { HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_LABEL), HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT | HWMON_F_LABEL, HWMON_F_INPUT | HWMON_F_LABEL), NULL }; static const struct hwmon_chip_info kraken2_chip_info = { .ops = &kraken2_hwmon_ops, .info = kraken2_info, }; static int kraken2_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct kraken2_priv_data *priv; if (size < 7 || report->id != STATUS_REPORT_ID) return 0; priv = hid_get_drvdata(hdev); /* * The fractional byte of the coolant temperature has been observed to * be in the interval [1,9], but some of these steps are also * consistently skipped for certain integer parts. * * For the lack of a better idea, assume that the resolution is 0.1°C, * and that the missing steps are artifacts of how the firmware * processes the raw sensor data. */ priv->temp_input[0] = data[1] * 1000 + data[2] * 100; priv->fan_input[0] = get_unaligned_be16(data + 3); priv->fan_input[1] = get_unaligned_be16(data + 5); priv->updated = jiffies; return 0; } static int kraken2_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct kraken2_priv_data *priv; int ret; priv = devm_kzalloc(&hdev->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; priv->hid_dev = hdev; hid_set_drvdata(hdev, priv); /* * Initialize ->updated to STATUS_VALIDITY seconds in the past, making * the initial empty data invalid for kraken2_read without the need for * a special case there. */ priv->updated = jiffies - STATUS_VALIDITY * HZ; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "hid parse failed with %d\n", ret); return ret; } /* * Enable hidraw so existing user-space tools can continue to work. */ ret = hid_hw_start(hdev, HID_CONNECT_HIDRAW); if (ret) { hid_err(hdev, "hid hw start failed with %d\n", ret); return ret; } ret = hid_hw_open(hdev); if (ret) { hid_err(hdev, "hid hw open failed with %d\n", ret); goto fail_and_stop; } priv->hwmon_dev = hwmon_device_register_with_info(&hdev->dev, "kraken2", priv, &kraken2_chip_info, NULL); if (IS_ERR(priv->hwmon_dev)) { ret = PTR_ERR(priv->hwmon_dev); hid_err(hdev, "hwmon registration failed with %d\n", ret); goto fail_and_close; } return 0; fail_and_close: hid_hw_close(hdev); fail_and_stop: hid_hw_stop(hdev); return ret; } static void kraken2_remove(struct hid_device *hdev) { struct kraken2_priv_data *priv = hid_get_drvdata(hdev); hwmon_device_unregister(priv->hwmon_dev); hid_hw_close(hdev); hid_hw_stop(hdev); } static const struct hid_device_id kraken2_table[] = { { HID_USB_DEVICE(0x1e71, 0x170e) }, /* NZXT Kraken X42/X52/X62/X72 */ { } }; MODULE_DEVICE_TABLE(hid, kraken2_table); static struct hid_driver kraken2_driver = { .name = "nzxt-kraken2", .id_table = kraken2_table, .probe = kraken2_probe, .remove = kraken2_remove, .raw_event = kraken2_raw_event, }; static int __init kraken2_init(void) { return hid_register_driver(&kraken2_driver); } static void __exit kraken2_exit(void) { hid_unregister_driver(&kraken2_driver); } /* * When compiled into the kernel, initialize after the hid bus. */ late_initcall(kraken2_init); module_exit(kraken2_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jonas Malaco <jonas@protocubo.io>"); MODULE_DESCRIPTION("Hwmon driver for NZXT Kraken X42/X52/X62/X72 coolers");
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 // SPDX-License-Identifier: GPL-2.0-only /* * inode.c - securityfs * * Copyright (C) 2005 Greg Kroah-Hartman <gregkh@suse.de> * * Based on fs/debugfs/inode.c which had the following copyright notice: * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. */ /* #define DEBUG */ #include <linux/sysfs.h> #include <linux/kobject.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/lsm_hooks.h> #include <linux/magic.h> static struct vfsmount *mount; static int mount_count; static void securityfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); free_inode_nonrcu(inode); } static const struct super_operations securityfs_super_operations = { .statfs = simple_statfs, .free_inode = securityfs_free_inode, }; static int securityfs_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr files[] = {{""}}; int error; error = simple_fill_super(sb, SECURITYFS_MAGIC, files); if (error) return error; sb->s_op = &securityfs_super_operations; return 0; } static int securityfs_get_tree(struct fs_context *fc) { return get_tree_single(fc, securityfs_fill_super); } static const struct fs_context_operations securityfs_context_ops = { .get_tree = securityfs_get_tree, }; static int securityfs_init_fs_context(struct fs_context *fc) { fc->ops = &securityfs_context_ops; return 0; } static struct file_system_type fs_type = { .owner = THIS_MODULE, .name = "securityfs", .init_fs_context = securityfs_init_fs_context, .kill_sb = kill_litter_super, }; /** * securityfs_create_dentry - create a dentry in the securityfs filesystem * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the securityfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * @iops: a point to a struct of inode_operations that should be used for * this file/dir * * This is the basic "create a file/dir/symlink" function for * securityfs. It allows for a wide range of flexibility in creating * a file, or a directory (if you want to create a directory, the * securityfs_create_dir() function is recommended to be used * instead). * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the * file is to be removed (no automatic cleanup happens if your module * is unloaded, you are responsible here). If an error occurs, the * function will return the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops, const struct inode_operations *iops) { struct dentry *dentry; struct inode *dir, *inode; int error; if (!(mode & S_IFMT)) mode = (mode & S_IALLUGO) | S_IFREG; pr_debug("securityfs: creating file '%s'\n",name); error = simple_pin_fs(&fs_type, &mount, &mount_count); if (error) return ERR_PTR(error); if (!parent) parent = mount->mnt_root; dir = d_inode(parent); inode_lock(dir); dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) goto out; if (d_really_is_positive(dentry)) { error = -EEXIST; goto out1; } inode = new_inode(dir->i_sb); if (!inode) { error = -ENOMEM; goto out1; } inode->i_ino = get_next_ino(); inode->i_mode = mode; simple_inode_init_ts(inode); inode->i_private = data; if (S_ISDIR(mode)) { inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; inc_nlink(inode); inc_nlink(dir); } else if (S_ISLNK(mode)) { inode->i_op = iops ? iops : &simple_symlink_inode_operations; inode->i_link = data; } else { inode->i_fop = fops; } d_instantiate(dentry, inode); dget(dentry); inode_unlock(dir); return dentry; out1: dput(dentry); dentry = ERR_PTR(error); out: inode_unlock(dir); simple_release_fs(&mount, &mount_count); return dentry; } /** * securityfs_create_file - create a file in the securityfs filesystem * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the securityfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * This function creates a file in securityfs with the given @name. * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here). If an error occurs, the function will return * the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ struct dentry *securityfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return securityfs_create_dentry(name, mode, parent, data, fops, NULL); } EXPORT_SYMBOL_GPL(securityfs_create_file); /** * securityfs_create_dir - create a directory in the securityfs filesystem * * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * directory will be created in the root of the securityfs filesystem. * * This function creates a directory in securityfs with the given @name. * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here). If an error occurs, the function will return * the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ struct dentry *securityfs_create_dir(const char *name, struct dentry *parent) { return securityfs_create_file(name, S_IFDIR | 0755, parent, NULL, NULL); } EXPORT_SYMBOL_GPL(securityfs_create_dir); /** * securityfs_create_symlink - create a symlink in the securityfs filesystem * * @name: a pointer to a string containing the name of the symlink to * create. * @parent: a pointer to the parent dentry for the symlink. This should be a * directory dentry if set. If this parameter is %NULL, then the * directory will be created in the root of the securityfs filesystem. * @target: a pointer to a string containing the name of the symlink's target. * If this parameter is %NULL, then the @iops parameter needs to be * setup to handle .readlink and .get_link inode_operations. * @iops: a pointer to the struct inode_operations to use for the symlink. If * this parameter is %NULL, then the default simple_symlink_inode * operations will be used. * * This function creates a symlink in securityfs with the given @name. * * This function returns a pointer to a dentry if it succeeds. This * pointer must be passed to the securityfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here). If an error occurs, the function will return * the error value (via ERR_PTR). * * If securityfs is not enabled in the kernel, the value %-ENODEV is * returned. */ struct dentry *securityfs_create_symlink(const char *name, struct dentry *parent, const char *target, const struct inode_operations *iops) { struct dentry *dent; char *link = NULL; if (target) { link = kstrdup(target, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); } dent = securityfs_create_dentry(name, S_IFLNK | 0444, parent, link, NULL, iops); if (IS_ERR(dent)) kfree(link); return dent; } EXPORT_SYMBOL_GPL(securityfs_create_symlink); /** * securityfs_remove - removes a file or directory from the securityfs filesystem * * @dentry: a pointer to a the dentry of the file or directory to be removed. * * This function removes a file or directory in securityfs that was previously * created with a call to another securityfs function (like * securityfs_create_file() or variants thereof.) * * This function is required to be called in order for the file to be * removed. No automatic cleanup of files will happen when a module is * removed; you are responsible here. */ void securityfs_remove(struct dentry *dentry) { struct inode *dir; if (IS_ERR_OR_NULL(dentry)) return; dir = d_inode(dentry->d_parent); inode_lock(dir); if (simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(dir, dentry); else simple_unlink(dir, dentry); dput(dentry); } inode_unlock(dir); simple_release_fs(&mount, &mount_count); } EXPORT_SYMBOL_GPL(securityfs_remove); static void remove_one(struct dentry *victim) { simple_release_fs(&mount, &mount_count); } /** * securityfs_recursive_remove - recursively removes a file or directory * * @dentry: a pointer to a the dentry of the file or directory to be removed. * * This function recursively removes a file or directory in securityfs that was * previously created with a call to another securityfs function (like * securityfs_create_file() or variants thereof.) */ void securityfs_recursive_remove(struct dentry *dentry) { if (IS_ERR_OR_NULL(dentry)) return; simple_pin_fs(&fs_type, &mount, &mount_count); simple_recursive_removal(dentry, remove_one); simple_release_fs(&mount, &mount_count); } EXPORT_SYMBOL_GPL(securityfs_recursive_remove); #ifdef CONFIG_SECURITY static struct dentry *lsm_dentry; static ssize_t lsm_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { return simple_read_from_buffer(buf, count, ppos, lsm_names, strlen(lsm_names)); } static const struct file_operations lsm_ops = { .read = lsm_read, .llseek = generic_file_llseek, }; #endif static int __init securityfs_init(void) { int retval; retval = sysfs_create_mount_point(kernel_kobj, "security"); if (retval) return retval; retval = register_filesystem(&fs_type); if (retval) { sysfs_remove_mount_point(kernel_kobj, "security"); return retval; } #ifdef CONFIG_SECURITY lsm_dentry = securityfs_create_file("lsm", 0444, NULL, NULL, &lsm_ops); #endif return 0; } core_initcall(securityfs_init);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 // SPDX-License-Identifier: GPL-2.0-only /* * "security" table for IPv6 * * This is for use by Mandatory Access Control (MAC) security models, * which need to be able to manage security policy in separate context * to DAC. * * Based on iptable_mangle.c * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org> * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com> */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>"); MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); #define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_SECURITY, }; static struct nf_hook_ops *sectbl_ops __read_mostly; static int ip6table_security_table_init(struct net *net) { struct ip6t_replace *repl; int ret; repl = ip6t_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; ret = ip6t_register_table(net, &security_table, repl, sectbl_ops); kfree(repl); return ret; } static void __net_exit ip6table_security_net_pre_exit(struct net *net) { ip6t_unregister_table_pre_exit(net, "security"); } static void __net_exit ip6table_security_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "security"); } static struct pernet_operations ip6table_security_net_ops = { .pre_exit = ip6table_security_net_pre_exit, .exit = ip6table_security_net_exit, }; static int __init ip6table_security_init(void) { int ret = xt_register_template(&security_table, ip6table_security_table_init); if (ret < 0) return ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table); if (IS_ERR(sectbl_ops)) { xt_unregister_template(&security_table); return PTR_ERR(sectbl_ops); } ret = register_pernet_subsys(&ip6table_security_net_ops); if (ret < 0) { kfree(sectbl_ops); xt_unregister_template(&security_table); return ret; } return ret; } static void __exit ip6table_security_fini(void) { unregister_pernet_subsys(&ip6table_security_net_ops); xt_unregister_template(&security_table); kfree(sectbl_ops); } module_init(ip6table_security_init); module_exit(ip6table_security_fini);
2 2 2 2 11 7 6 9 8 2 5 1 4 2 3 2 13 9 9 5 1 2 3 11 3 2 2 2 2 2 3 3 3 2 14 12 2 3 3 3 2 1 3 3 2 2 1 168 1 1 1 1 2 1 3 2 9 1 3 5 4 147 8 4 8 7 7 6 7 8 8 8 8 8 8 4 1 4 9 9 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 // SPDX-License-Identifier: GPL-2.0-only /* By Ross Biro 1/23/92 */ /* * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/ptrace.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/security.h> #include <linux/audit.h> #include <linux/seccomp.h> #include <linux/signal.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/context_tracking.h> #include <linux/nospec.h> #include <linux/uaccess.h> #include <asm/processor.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> #include <asm/prctl.h> #include <asm/proto.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/syscall.h> #include <asm/fsgsbase.h> #include <asm/io_bitmap.h> #include "tls.h" enum x86_regset_32 { REGSET32_GENERAL, REGSET32_FP, REGSET32_XFP, REGSET32_XSTATE, REGSET32_TLS, REGSET32_IOPERM, }; enum x86_regset_64 { REGSET64_GENERAL, REGSET64_FP, REGSET64_IOPERM, REGSET64_XSTATE, REGSET64_SSP, }; #define REGSET_GENERAL \ ({ \ BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \ REGSET32_GENERAL; \ }) #define REGSET_FP \ ({ \ BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \ REGSET32_FP; \ }) struct pt_regs_offset { const char *name; int offset; }; #define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} #define REG_OFFSET_END {.name = NULL, .offset = 0} static const struct pt_regs_offset regoffset_table[] = { #ifdef CONFIG_X86_64 REG_OFFSET_NAME(r15), REG_OFFSET_NAME(r14), REG_OFFSET_NAME(r13), REG_OFFSET_NAME(r12), REG_OFFSET_NAME(r11), REG_OFFSET_NAME(r10), REG_OFFSET_NAME(r9), REG_OFFSET_NAME(r8), #endif REG_OFFSET_NAME(bx), REG_OFFSET_NAME(cx), REG_OFFSET_NAME(dx), REG_OFFSET_NAME(si), REG_OFFSET_NAME(di), REG_OFFSET_NAME(bp), REG_OFFSET_NAME(ax), #ifdef CONFIG_X86_32 REG_OFFSET_NAME(ds), REG_OFFSET_NAME(es), REG_OFFSET_NAME(fs), REG_OFFSET_NAME(gs), #endif REG_OFFSET_NAME(orig_ax), REG_OFFSET_NAME(ip), REG_OFFSET_NAME(cs), REG_OFFSET_NAME(flags), REG_OFFSET_NAME(sp), REG_OFFSET_NAME(ss), REG_OFFSET_END, }; /** * regs_query_register_offset() - query register offset from its name * @name: the name of a register * * regs_query_register_offset() returns the offset of a register in struct * pt_regs from its name. If the name is invalid, this returns -EINVAL; */ int regs_query_register_offset(const char *name) { const struct pt_regs_offset *roff; for (roff = regoffset_table; roff->name != NULL; roff++) if (!strcmp(roff->name, name)) return roff->offset; return -EINVAL; } /** * regs_query_register_name() - query register name from its offset * @offset: the offset of a register in struct pt_regs. * * regs_query_register_name() returns the name of a register from its * offset in struct pt_regs. If the @offset is invalid, this returns NULL; */ const char *regs_query_register_name(unsigned int offset) { const struct pt_regs_offset *roff; for (roff = regoffset_table; roff->name != NULL; roff++) if (roff->offset == offset) return roff->name; return NULL; } /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. */ /* * Determines which flags the user has access to [1 = access, 0 = no access]. */ #define FLAG_MASK_32 ((unsigned long) \ (X86_EFLAGS_CF | X86_EFLAGS_PF | \ X86_EFLAGS_AF | X86_EFLAGS_ZF | \ X86_EFLAGS_SF | X86_EFLAGS_TF | \ X86_EFLAGS_DF | X86_EFLAGS_OF | \ X86_EFLAGS_RF | X86_EFLAGS_AC)) /* * Determines whether a value may be installed in a segment register. */ static inline bool invalid_selector(u16 value) { return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL); } #ifdef CONFIG_X86_32 #define FLAG_MASK FLAG_MASK_32 static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); return &regs->bx + (regno >> 2); } static u16 get_segment_reg(struct task_struct *task, unsigned long offset) { /* * Returning the value truncates it to 16 bits. */ unsigned int retval; if (offset != offsetof(struct user_regs_struct, gs)) retval = *pt_regs_access(task_pt_regs(task), offset); else { if (task == current) savesegment(gs, retval); else retval = task->thread.gs; } return retval; } static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { if (WARN_ON_ONCE(task == current)) return -EIO; /* * The value argument was already truncated to 16 bits. */ if (invalid_selector(value)) return -EIO; /* * For %cs and %ss we cannot permit a null selector. * We can permit a bogus selector as long as it has USER_RPL. * Null selectors are fine for other segment registers, but * we will never get back to user mode with invalid %cs or %ss * and will take the trap in iret instead. Much code relies * on user_mode() to distinguish a user trap frame (which can * safely use invalid selectors) from a kernel trap frame. */ switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ss): if (unlikely(value == 0)) return -EIO; fallthrough; default: *pt_regs_access(task_pt_regs(task), offset) = value; break; case offsetof(struct user_regs_struct, gs): task->thread.gs = value; } return 0; } #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset) { BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0); return &regs->r15 + (offset / sizeof(regs->r15)); } static u16 get_segment_reg(struct task_struct *task, unsigned long offset) { /* * Returning the value truncates it to 16 bits. */ unsigned int seg; switch (offset) { case offsetof(struct user_regs_struct, fs): if (task == current) { /* Older gas can't assemble movq %?s,%r?? */ asm("movl %%fs,%0" : "=r" (seg)); return seg; } return task->thread.fsindex; case offsetof(struct user_regs_struct, gs): if (task == current) { asm("movl %%gs,%0" : "=r" (seg)); return seg; } return task->thread.gsindex; case offsetof(struct user_regs_struct, ds): if (task == current) { asm("movl %%ds,%0" : "=r" (seg)); return seg; } return task->thread.ds; case offsetof(struct user_regs_struct, es): if (task == current) { asm("movl %%es,%0" : "=r" (seg)); return seg; } return task->thread.es; case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ss): break; } return *pt_regs_access(task_pt_regs(task), offset); } static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { if (WARN_ON_ONCE(task == current)) return -EIO; /* * The value argument was already truncated to 16 bits. */ if (invalid_selector(value)) return -EIO; /* * Writes to FS and GS will change the stored selector. Whether * this changes the segment base as well depends on whether * FSGSBASE is enabled. */ switch (offset) { case offsetof(struct user_regs_struct,fs): task->thread.fsindex = value; break; case offsetof(struct user_regs_struct,gs): task->thread.gsindex = value; break; case offsetof(struct user_regs_struct,ds): task->thread.ds = value; break; case offsetof(struct user_regs_struct,es): task->thread.es = value; break; /* * Can't actually change these in 64-bit mode. */ case offsetof(struct user_regs_struct,cs): if (unlikely(value == 0)) return -EIO; task_pt_regs(task)->cs = value; break; case offsetof(struct user_regs_struct,ss): if (unlikely(value == 0)) return -EIO; task_pt_regs(task)->ss = value; break; } return 0; } #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) { unsigned long retval = task_pt_regs(task)->flags; /* * If the debugger set TF, hide it from the readout. */ if (test_tsk_thread_flag(task, TIF_FORCED_TF)) retval &= ~X86_EFLAGS_TF; return retval; } static int set_flags(struct task_struct *task, unsigned long value) { struct pt_regs *regs = task_pt_regs(task); /* * If the user value contains TF, mark that * it was not "us" (the debugger) that set it. * If not, make sure it stays set if we had. */ if (value & X86_EFLAGS_TF) clear_tsk_thread_flag(task, TIF_FORCED_TF); else if (test_tsk_thread_flag(task, TIF_FORCED_TF)) value |= X86_EFLAGS_TF; regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK); return 0; } static int putreg(struct task_struct *child, unsigned long offset, unsigned long value) { switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ds): case offsetof(struct user_regs_struct, es): case offsetof(struct user_regs_struct, fs): case offsetof(struct user_regs_struct, gs): case offsetof(struct user_regs_struct, ss): return set_segment_reg(child, offset, value); case offsetof(struct user_regs_struct, flags): return set_flags(child, value); #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): if (value >= TASK_SIZE_MAX) return -EIO; x86_gsbase_write_task(child, value); return 0; #endif } *pt_regs_access(task_pt_regs(child), offset) = value; return 0; } static unsigned long getreg(struct task_struct *task, unsigned long offset) { switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ds): case offsetof(struct user_regs_struct, es): case offsetof(struct user_regs_struct, fs): case offsetof(struct user_regs_struct, gs): case offsetof(struct user_regs_struct, ss): return get_segment_reg(task, offset); case offsetof(struct user_regs_struct, flags): return get_flags(task); #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct, fs_base): return x86_fsbase_read_task(task); case offsetof(struct user_regs_struct, gs_base): return x86_gsbase_read_task(task); #endif } return *pt_regs_access(task_pt_regs(task), offset); } static int genregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int reg; for (reg = 0; to.left; reg++) membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); return 0; } static int genregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret = 0; if (kbuf) { const unsigned long *k = kbuf; while (count >= sizeof(*k) && !ret) { ret = putreg(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const unsigned long __user *u = ubuf; while (count >= sizeof(*u) && !ret) { unsigned long word; ret = __get_user(word, u++); if (ret) break; ret = putreg(target, pos, word); count -= sizeof(*u); pos += sizeof(*u); } } return ret; } static void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { int i; struct thread_struct *thread = &(current->thread); /* * Store in the virtual DR6 register the fact that the breakpoint * was hit so the thread's debugger will see it. */ for (i = 0; i < HBP_NUM; i++) { if (thread->ptrace_bps[i] == bp) break; } thread->virtual_dr6 |= (DR_TRAP0 << i); } /* * Walk through every ptrace breakpoints for this thread and * build the dr7 value on top of their attributes. * */ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) { int i; int dr7 = 0; struct arch_hw_breakpoint *info; for (i = 0; i < HBP_NUM; i++) { if (bp[i] && !bp[i]->attr.disabled) { info = counter_arch_bp(bp[i]); dr7 |= encode_dr7(i, info->len, info->type); } } return dr7; } static int ptrace_fill_bp_fields(struct perf_event_attr *attr, int len, int type, bool disabled) { int err, bp_len, bp_type; err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); if (!err) { attr->bp_len = bp_len; attr->bp_type = bp_type; attr->disabled = disabled; } return err; } static struct perf_event * ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, unsigned long addr, bool disabled) { struct perf_event_attr attr; int err; ptrace_breakpoint_init(&attr); attr.bp_addr = addr; err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) return ERR_PTR(err); return register_user_hw_breakpoint(&attr, ptrace_triggered, NULL, tsk); } static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, int disabled) { struct perf_event_attr attr = bp->attr; int err; err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) return err; return modify_user_hw_breakpoint(bp, &attr); } /* * Handle ptrace writes to debug register 7. */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { struct thread_struct *thread = &tsk->thread; unsigned long old_dr7; bool second_pass = false; int i, rc, ret = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: rc = 0; for (i = 0; i < HBP_NUM; i++) { unsigned len, type; bool disabled = !decode_dr7(data, i, &len, &type); struct perf_event *bp = thread->ptrace_bps[i]; if (!bp) { if (disabled) continue; bp = ptrace_register_breakpoint(tsk, len, type, 0, disabled); if (IS_ERR(bp)) { rc = PTR_ERR(bp); break; } thread->ptrace_bps[i] = bp; continue; } rc = ptrace_modify_breakpoint(bp, len, type, disabled); if (rc) break; } /* Restore if the first pass failed, second_pass shouldn't fail. */ if (rc && !WARN_ON(second_pass)) { ret = rc; data = old_dr7; second_pass = true; goto restore; } return ret; } /* * Handle PTRACE_PEEKUSR calls for the debug register area. */ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { struct thread_struct *thread = &tsk->thread; unsigned long val = 0; if (n < HBP_NUM) { int index = array_index_nospec(n, HBP_NUM); struct perf_event *bp = thread->ptrace_bps[index]; if (bp) val = bp->hw.info.address; } else if (n == 6) { val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to arch polarity */ } else if (n == 7) { val = thread->ptrace_dr7; } return val; } static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, unsigned long addr) { struct thread_struct *t = &tsk->thread; struct perf_event *bp = t->ptrace_bps[nr]; int err = 0; if (!bp) { /* * Put stub len and type to create an inactive but correct bp. * * CHECKME: the previous code returned -EIO if the addr wasn't * a valid task virtual addr. The new one will return -EINVAL in * this case. * -EINVAL may be what we want for in-kernel breakpoints users, * but -EIO looks better for ptrace, since we refuse a register * writing for the user. And anyway this is the previous * behaviour. */ bp = ptrace_register_breakpoint(tsk, X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, addr, true); if (IS_ERR(bp)) err = PTR_ERR(bp); else t->ptrace_bps[nr] = bp; } else { struct perf_event_attr attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } return err; } /* * Handle PTRACE_POKEUSR calls for the debug register area. */ static int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { struct thread_struct *thread = &tsk->thread; /* There are no DR4 or DR5 registers */ int rc = -EIO; if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); } else if (n == 6) { thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */ rc = 0; } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); if (!rc) thread->ptrace_dr7 = val; } return rc; } /* * These access the current or another (stopped) task's io permission * bitmap for debugging or core dump. */ static int ioperm_active(struct task_struct *target, const struct user_regset *regset) { struct io_bitmap *iobm = target->thread.io_bitmap; return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0; } static int ioperm_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct io_bitmap *iobm = target->thread.io_bitmap; if (!iobm) return -ENXIO; return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES); } /* * Called by kernel/ptrace.c when detaching.. * * Make sure the single step bit is not set. */ void ptrace_disable(struct task_struct *child) { user_disable_single_step(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif #ifdef CONFIG_X86_64 static const struct user_regset_view user_x86_64_view; /* Initialized below. */ #endif long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { int ret; unsigned long __user *datap = (unsigned long __user *)data; #ifdef CONFIG_X86_64 /* This is native 64-bit ptrace() */ const struct user_regset_view *regset_view = &user_x86_64_view; #else /* This is native 32-bit ptrace() */ const struct user_regset_view *regset_view = &user_x86_32_view; #endif switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { unsigned long tmp; ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; tmp = 0; /* Default return condition */ if (addr < sizeof(struct user_regs_struct)) tmp = getreg(child, addr); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); tmp = ptrace_get_debugreg(child, addr / sizeof(data)); } ret = put_user(tmp, datap); break; } case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; if (addr < sizeof(struct user_regs_struct)) ret = putreg(child, addr, data); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); ret = ptrace_set_debugreg(child, addr / sizeof(data), data); } break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); #ifdef CONFIG_X86_32 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case PTRACE_GET_THREAD_AREA: if ((int) addr < 0) return -EIO; ret = do_get_thread_area(child, addr, (struct user_desc __user *)data); break; case PTRACE_SET_THREAD_AREA: if ((int) addr < 0) return -EIO; ret = do_set_thread_area(child, addr, (struct user_desc __user *)data, 0); break; #endif #ifdef CONFIG_X86_64 /* normal 64bit interface to access TLS data. Works just like arch_prctl, except that the arguments are reversed. */ case PTRACE_ARCH_PRCTL: ret = do_arch_prctl_64(child, data, addr); break; #endif default: ret = ptrace_request(child, request, addr, data); break; } return ret; } #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> #include <linux/syscalls.h> #include <asm/ia32.h> #include <asm/user32.h> #define R32(l,q) \ case offsetof(struct user32, regs.l): \ regs->q = value; break #define SEG32(rs) \ case offsetof(struct user32, regs.rs): \ return set_segment_reg(child, \ offsetof(struct user_regs_struct, rs), \ value); \ break static int putreg32(struct task_struct *child, unsigned regno, u32 value) { struct pt_regs *regs = task_pt_regs(child); int ret; switch (regno) { SEG32(cs); SEG32(ds); SEG32(es); /* * A 32-bit ptracer on a 64-bit kernel expects that writing * FS or GS will also update the base. This is needed for * operations like PTRACE_SETREGS to fully restore a saved * CPU state. */ case offsetof(struct user32, regs.fs): ret = set_segment_reg(child, offsetof(struct user_regs_struct, fs), value); if (ret == 0) child->thread.fsbase = x86_fsgsbase_read_task(child, value); return ret; case offsetof(struct user32, regs.gs): ret = set_segment_reg(child, offsetof(struct user_regs_struct, gs), value); if (ret == 0) child->thread.gsbase = x86_fsgsbase_read_task(child, value); return ret; SEG32(ss); R32(ebx, bx); R32(ecx, cx); R32(edx, dx); R32(edi, di); R32(esi, si); R32(ebp, bp); R32(eax, ax); R32(eip, ip); R32(esp, sp); case offsetof(struct user32, regs.orig_eax): /* * Warning: bizarre corner case fixup here. A 32-bit * debugger setting orig_eax to -1 wants to disable * syscall restart. Make sure that the syscall * restart code sign-extends orig_ax. Also make sure * we interpret the -ERESTART* codes correctly if * loaded into regs->ax in case the task is not * actually still sitting at the exit from a 32-bit * syscall with TS_COMPAT still set. */ regs->orig_ax = value; if (syscall_get_nr(child, regs) != -1) child->thread_info.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): return set_flags(child, value); case offsetof(struct user32, u_debugreg[0]) ... offsetof(struct user32, u_debugreg[7]): regno -= offsetof(struct user32, u_debugreg[0]); return ptrace_set_debugreg(child, regno / 4, value); default: if (regno > sizeof(struct user32) || (regno & 3)) return -EIO; /* * Other dummy fields in the virtual user structure * are ignored */ break; } return 0; } #undef R32 #undef SEG32 #define R32(l,q) \ case offsetof(struct user32, regs.l): \ *val = regs->q; break #define SEG32(rs) \ case offsetof(struct user32, regs.rs): \ *val = get_segment_reg(child, \ offsetof(struct user_regs_struct, rs)); \ break static int getreg32(struct task_struct *child, unsigned regno, u32 *val) { struct pt_regs *regs = task_pt_regs(child); switch (regno) { SEG32(ds); SEG32(es); SEG32(fs); SEG32(gs); R32(cs, cs); R32(ss, ss); R32(ebx, bx); R32(ecx, cx); R32(edx, dx); R32(edi, di); R32(esi, si); R32(ebp, bp); R32(eax, ax); R32(orig_eax, orig_ax); R32(eip, ip); R32(esp, sp); case offsetof(struct user32, regs.eflags): *val = get_flags(child); break; case offsetof(struct user32, u_debugreg[0]) ... offsetof(struct user32, u_debugreg[7]): regno -= offsetof(struct user32, u_debugreg[0]); *val = ptrace_get_debugreg(child, regno / 4); break; default: if (regno > sizeof(struct user32) || (regno & 3)) return -EIO; /* * Other dummy fields in the virtual user structure * are ignored */ *val = 0; break; } return 0; } #undef R32 #undef SEG32 static int genregs32_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int reg; for (reg = 0; to.left; reg++) { u32 val; getreg32(target, reg * 4, &val); membuf_store(&to, val); } return 0; } static int genregs32_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret = 0; if (kbuf) { const compat_ulong_t *k = kbuf; while (count >= sizeof(*k) && !ret) { ret = putreg32(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const compat_ulong_t __user *u = ubuf; while (count >= sizeof(*u) && !ret) { compat_ulong_t word; ret = __get_user(word, u++); if (ret) break; ret = putreg32(target, pos, word); count -= sizeof(*u); pos += sizeof(*u); } } return ret; } static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { unsigned long addr = caddr; unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; __u32 val; switch (request) { case PTRACE_PEEKUSR: ret = getreg32(child, addr, &val); if (ret == 0) ret = put_user(val, (__u32 __user *)datap); break; case PTRACE_POKEUSR: ret = putreg32(child, addr, data); break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct32), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct32), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET_FP, 0, sizeof(struct user_i387_ia32_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user( child, &user_x86_32_view, REGSET_FP, 0, sizeof(struct user_i387_ia32_struct), datap); case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: return arch_ptrace(child, request, addr, data); default: return compat_ptrace_request(child, request, addr, data); } return ret; } #endif /* CONFIG_IA32_EMULATION */ #ifdef CONFIG_X86_X32_ABI static long x32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { unsigned long addr = caddr; unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; switch (request) { /* Read 32bits at location addr in the USER area. Only allow to return the lower 32bits of segment and debug registers. */ case PTRACE_PEEKUSR: { u32 tmp; ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || addr < offsetof(struct user_regs_struct, cs)) break; tmp = 0; /* Default return condition */ if (addr < sizeof(struct user_regs_struct)) tmp = getreg(child, addr); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); tmp = ptrace_get_debugreg(child, addr / sizeof(data)); } ret = put_user(tmp, (__u32 __user *)datap); break; } /* Write the word at location addr in the USER area. Only allow to update segment and debug registers with the upper 32bits zero-extended. */ case PTRACE_POKEUSR: ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || addr < offsetof(struct user_regs_struct, cs)) break; if (addr < sizeof(struct user_regs_struct)) ret = putreg(child, addr, data); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); ret = ptrace_set_debugreg(child, addr / sizeof(data), data); } break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); default: return compat_ptrace_request(child, request, addr, data); } return ret; } #endif #ifdef CONFIG_COMPAT long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { #ifdef CONFIG_X86_X32_ABI if (!in_ia32_syscall()) return x32_arch_ptrace(child, request, caddr, cdata); #endif #ifdef CONFIG_IA32_EMULATION return ia32_arch_ptrace(child, request, caddr, cdata); #else return 0; #endif } #endif /* CONFIG_COMPAT */ #ifdef CONFIG_X86_64 static struct user_regset x86_64_regsets[] __ro_after_init = { [REGSET64_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .regset_get = genregs_get, .set = genregs_set }, [REGSET64_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct fxregs_state) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET64_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET64_IOPERM] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, .size = sizeof(long), .align = sizeof(long), .active = ioperm_active, .regset_get = ioperm_get }, #ifdef CONFIG_X86_USER_SHADOW_STACK [REGSET64_SSP] = { .core_note_type = NT_X86_SHSTK, .n = 1, .size = sizeof(u64), .align = sizeof(u64), .active = ssp_active, .regset_get = ssp_get, .set = ssp_set }, #endif }; static const struct user_regset_view user_x86_64_view = { .name = "x86_64", .e_machine = EM_X86_64, .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets) }; #else /* CONFIG_X86_32 */ #define user_regs_struct32 user_regs_struct #define genregs32_get genregs_get #define genregs32_set genregs_set #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static struct user_regset x86_32_regsets[] __ro_after_init = { [REGSET32_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = genregs32_get, .set = genregs32_set }, [REGSET32_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set }, [REGSET32_XFP] = { .core_note_type = NT_PRXFPREG, .n = sizeof(struct fxregs_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET32_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET32_TLS] = { .core_note_type = NT_386_TLS, .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, .size = sizeof(struct user_desc), .align = sizeof(struct user_desc), .active = regset_tls_active, .regset_get = regset_tls_get, .set = regset_tls_set }, [REGSET32_IOPERM] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_BYTES / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = ioperm_active, .regset_get = ioperm_get }, }; static const struct user_regset_view user_x86_32_view = { .name = "i386", .e_machine = EM_386, .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets) }; #endif /* * This represents bytes 464..511 in the memory layout exported through * the REGSET_XSTATE interface. */ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64); #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64); #endif xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } /* * This is used by the core dump code to decide which regset to dump. The * core dump code writes out the resulting .e_machine and the corresponding * regsets. This is suboptimal if the task is messing around with its CS.L * field, but at worst the core dump will end up missing some information. * * Unfortunately, it is also used by the broken PTRACE_GETREGSET and * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have * no way to make sure that the e_machine they use matches the caller's * expectations. The result is that the data format returned by * PTRACE_GETREGSET depends on the returned CS field (and even the offset * of the returned CS field depends on its value!) and the data format * accepted by PTRACE_SETREGSET is determined by the old CS value. The * upshot is that it is basically impossible to use these APIs correctly. * * The best way to fix it in the long run would probably be to add new * improved ptrace() APIs to read and write registers reliably, possibly by * allowing userspace to select the ELF e_machine variant that they expect. */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION if (!user_64bit_mode(task_pt_regs(task))) #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION return &user_x86_32_view; #endif #ifdef CONFIG_X86_64 return &user_x86_64_view; #endif } void send_sigtrap(struct pt_regs *regs, int error_code, int si_code) { struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; /* Send us the fake SIGTRAP */ force_sig_fault(SIGTRAP, si_code, user_mode(regs) ? (void __user *)regs->ip : NULL); } void user_single_step_report(struct pt_regs *regs) { send_sigtrap(regs, 0, TRAP_BRKPT); }
4507 164 4313 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BSEARCH_H #define _LINUX_BSEARCH_H #include <linux/types.h> static __always_inline void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) { const char *pivot; int result; while (num > 0) { pivot = base + (num >> 1) * size; result = cmp(key, pivot); if (result == 0) return (void *)pivot; if (result > 0) { base = pivot + size; num--; } num >>= 1; } return NULL; } extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp); #endif /* _LINUX_BSEARCH_H */
24 24 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 /* * Copyright (c) 2004-2011 Atheros Communications Inc. * Copyright (c) 2011-2012 Qualcomm Atheros, Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "core.h" #include <linux/skbuff.h> #include <linux/fs.h> #include <linux/vmalloc.h> #include <linux/export.h> #include "debug.h" #include "target.h" struct ath6kl_fwlog_slot { __le32 timestamp; __le32 length; /* max ATH6KL_FWLOG_PAYLOAD_SIZE bytes */ u8 payload[]; }; #define ATH6KL_FWLOG_MAX_ENTRIES 20 #define ATH6KL_FWLOG_VALID_MASK 0x1ffff void ath6kl_printk(const char *level, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk("%sath6kl: %pV", level, &vaf); va_end(args); } EXPORT_SYMBOL(ath6kl_printk); void ath6kl_info(const char *fmt, ...) { struct va_format vaf = { .fmt = fmt, }; va_list args; va_start(args, fmt); vaf.va = &args; ath6kl_printk(KERN_INFO, "%pV", &vaf); trace_ath6kl_log_info(&vaf); va_end(args); } EXPORT_SYMBOL(ath6kl_info); void ath6kl_err(const char *fmt, ...) { struct va_format vaf = { .fmt = fmt, }; va_list args; va_start(args, fmt); vaf.va = &args; ath6kl_printk(KERN_ERR, "%pV", &vaf); trace_ath6kl_log_err(&vaf); va_end(args); } EXPORT_SYMBOL(ath6kl_err); void ath6kl_warn(const char *fmt, ...) { struct va_format vaf = { .fmt = fmt, }; va_list args; va_start(args, fmt); vaf.va = &args; ath6kl_printk(KERN_WARNING, "%pV", &vaf); trace_ath6kl_log_warn(&vaf); va_end(args); } EXPORT_SYMBOL(ath6kl_warn); int ath6kl_read_tgt_stats(struct ath6kl *ar, struct ath6kl_vif *vif) { long left; if (down_interruptible(&ar->sem)) return -EBUSY; set_bit(STATS_UPDATE_PEND, &vif->flags); if (ath6kl_wmi_get_stats_cmd(ar->wmi, 0)) { up(&ar->sem); return -EIO; } left = wait_event_interruptible_timeout(ar->event_wq, !test_bit(STATS_UPDATE_PEND, &vif->flags), WMI_TIMEOUT); up(&ar->sem); if (left <= 0) return -ETIMEDOUT; return 0; } EXPORT_SYMBOL(ath6kl_read_tgt_stats); #ifdef CONFIG_ATH6KL_DEBUG void ath6kl_dbg(enum ATH6K_DEBUG_MASK mask, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (debug_mask & mask) ath6kl_printk(KERN_DEBUG, "%pV", &vaf); trace_ath6kl_log_dbg(mask, &vaf); va_end(args); } EXPORT_SYMBOL(ath6kl_dbg); void ath6kl_dbg_dump(enum ATH6K_DEBUG_MASK mask, const char *msg, const char *prefix, const void *buf, size_t len) { if (debug_mask & mask) { if (msg) ath6kl_dbg(mask, "%s\n", msg); print_hex_dump_bytes(prefix, DUMP_PREFIX_OFFSET, buf, len); } /* tracing code doesn't like null strings :/ */ trace_ath6kl_log_dbg_dump(msg ? msg : "", prefix ? prefix : "", buf, len); } EXPORT_SYMBOL(ath6kl_dbg_dump); #define REG_OUTPUT_LEN_PER_LINE 25 #define REGTYPE_STR_LEN 100 struct ath6kl_diag_reg_info { u32 reg_start; u32 reg_end; const char *reg_info; }; static const struct ath6kl_diag_reg_info diag_reg[] = { { 0x20000, 0x200fc, "General DMA and Rx registers" }, { 0x28000, 0x28900, "MAC PCU register & keycache" }, { 0x20800, 0x20a40, "QCU" }, { 0x21000, 0x212f0, "DCU" }, { 0x4000, 0x42e4, "RTC" }, { 0x540000, 0x540000 + (256 * 1024), "RAM" }, { 0x29800, 0x2B210, "Base Band" }, { 0x1C000, 0x1C748, "Analog" }, }; void ath6kl_dump_registers(struct ath6kl_device *dev, struct ath6kl_irq_proc_registers *irq_proc_reg, struct ath6kl_irq_enable_reg *irq_enable_reg) { ath6kl_dbg(ATH6KL_DBG_IRQ, ("<------- Register Table -------->\n")); if (irq_proc_reg != NULL) { ath6kl_dbg(ATH6KL_DBG_IRQ, "Host Int status: 0x%x\n", irq_proc_reg->host_int_status); ath6kl_dbg(ATH6KL_DBG_IRQ, "CPU Int status: 0x%x\n", irq_proc_reg->cpu_int_status); ath6kl_dbg(ATH6KL_DBG_IRQ, "Error Int status: 0x%x\n", irq_proc_reg->error_int_status); ath6kl_dbg(ATH6KL_DBG_IRQ, "Counter Int status: 0x%x\n", irq_proc_reg->counter_int_status); ath6kl_dbg(ATH6KL_DBG_IRQ, "Mbox Frame: 0x%x\n", irq_proc_reg->mbox_frame); ath6kl_dbg(ATH6KL_DBG_IRQ, "Rx Lookahead Valid: 0x%x\n", irq_proc_reg->rx_lkahd_valid); ath6kl_dbg(ATH6KL_DBG_IRQ, "Rx Lookahead 0: 0x%x\n", irq_proc_reg->rx_lkahd[0]); ath6kl_dbg(ATH6KL_DBG_IRQ, "Rx Lookahead 1: 0x%x\n", irq_proc_reg->rx_lkahd[1]); if (dev->ar->mbox_info.gmbox_addr != 0) { /* * If the target supports GMBOX hardware, dump some * additional state. */ ath6kl_dbg(ATH6KL_DBG_IRQ, "GMBOX Host Int status 2: 0x%x\n", irq_proc_reg->host_int_status2); ath6kl_dbg(ATH6KL_DBG_IRQ, "GMBOX RX Avail: 0x%x\n", irq_proc_reg->gmbox_rx_avail); ath6kl_dbg(ATH6KL_DBG_IRQ, "GMBOX lookahead alias 0: 0x%x\n", irq_proc_reg->rx_gmbox_lkahd_alias[0]); ath6kl_dbg(ATH6KL_DBG_IRQ, "GMBOX lookahead alias 1: 0x%x\n", irq_proc_reg->rx_gmbox_lkahd_alias[1]); } } if (irq_enable_reg != NULL) { ath6kl_dbg(ATH6KL_DBG_IRQ, "Int status Enable: 0x%x\n", irq_enable_reg->int_status_en); ath6kl_dbg(ATH6KL_DBG_IRQ, "Counter Int status Enable: 0x%x\n", irq_enable_reg->cntr_int_status_en); } ath6kl_dbg(ATH6KL_DBG_IRQ, "<------------------------------->\n"); } static void dump_cred_dist(struct htc_endpoint_credit_dist *ep_dist) { ath6kl_dbg(ATH6KL_DBG_CREDIT, "--- endpoint: %d svc_id: 0x%X ---\n", ep_dist->endpoint, ep_dist->svc_id); ath6kl_dbg(ATH6KL_DBG_CREDIT, " dist_flags : 0x%X\n", ep_dist->dist_flags); ath6kl_dbg(ATH6KL_DBG_CREDIT, " cred_norm : %d\n", ep_dist->cred_norm); ath6kl_dbg(ATH6KL_DBG_CREDIT, " cred_min : %d\n", ep_dist->cred_min); ath6kl_dbg(ATH6KL_DBG_CREDIT, " credits : %d\n", ep_dist->credits); ath6kl_dbg(ATH6KL_DBG_CREDIT, " cred_assngd : %d\n", ep_dist->cred_assngd); ath6kl_dbg(ATH6KL_DBG_CREDIT, " seek_cred : %d\n", ep_dist->seek_cred); ath6kl_dbg(ATH6KL_DBG_CREDIT, " cred_sz : %d\n", ep_dist->cred_sz); ath6kl_dbg(ATH6KL_DBG_CREDIT, " cred_per_msg : %d\n", ep_dist->cred_per_msg); ath6kl_dbg(ATH6KL_DBG_CREDIT, " cred_to_dist : %d\n", ep_dist->cred_to_dist); ath6kl_dbg(ATH6KL_DBG_CREDIT, " txq_depth : %d\n", get_queue_depth(&ep_dist->htc_ep->txq)); ath6kl_dbg(ATH6KL_DBG_CREDIT, "----------------------------------\n"); } /* FIXME: move to htc.c */ void dump_cred_dist_stats(struct htc_target *target) { struct htc_endpoint_credit_dist *ep_list; list_for_each_entry(ep_list, &target->cred_dist_list, list) dump_cred_dist(ep_list); ath6kl_dbg(ATH6KL_DBG_CREDIT, "credit distribution total %d free %d\n", target->credit_info->total_avail_credits, target->credit_info->cur_free_credits); } void ath6kl_debug_war(struct ath6kl *ar, enum ath6kl_war war) { switch (war) { case ATH6KL_WAR_INVALID_RATE: ar->debug.war_stats.invalid_rate++; break; } } static ssize_t read_file_war_stats(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; char *buf; unsigned int len = 0, buf_len = 1500; ssize_t ret_cnt; buf = kzalloc(buf_len, GFP_KERNEL); if (!buf) return -ENOMEM; len += scnprintf(buf + len, buf_len - len, "\n"); len += scnprintf(buf + len, buf_len - len, "%25s\n", "Workaround stats"); len += scnprintf(buf + len, buf_len - len, "%25s\n\n", "================="); len += scnprintf(buf + len, buf_len - len, "%20s %10u\n", "Invalid rates", ar->debug.war_stats.invalid_rate); if (WARN_ON(len > buf_len)) len = buf_len; ret_cnt = simple_read_from_buffer(user_buf, count, ppos, buf, len); kfree(buf); return ret_cnt; } static const struct file_operations fops_war_stats = { .read = read_file_war_stats, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; void ath6kl_debug_fwlog_event(struct ath6kl *ar, const void *buf, size_t len) { struct ath6kl_fwlog_slot *slot; struct sk_buff *skb; size_t slot_len; if (WARN_ON(len > ATH6KL_FWLOG_PAYLOAD_SIZE)) return; slot_len = sizeof(*slot) + ATH6KL_FWLOG_PAYLOAD_SIZE; skb = alloc_skb(slot_len, GFP_KERNEL); if (!skb) return; slot = skb_put(skb, slot_len); slot->timestamp = cpu_to_le32(jiffies); slot->length = cpu_to_le32(len); memcpy(slot->payload, buf, len); /* Need to pad each record to fixed length ATH6KL_FWLOG_PAYLOAD_SIZE */ memset(slot->payload + len, 0, ATH6KL_FWLOG_PAYLOAD_SIZE - len); spin_lock(&ar->debug.fwlog_queue.lock); __skb_queue_tail(&ar->debug.fwlog_queue, skb); complete(&ar->debug.fwlog_completion); /* drop oldest entries */ while (skb_queue_len(&ar->debug.fwlog_queue) > ATH6KL_FWLOG_MAX_ENTRIES) { skb = __skb_dequeue(&ar->debug.fwlog_queue); kfree_skb(skb); } spin_unlock(&ar->debug.fwlog_queue.lock); return; } static int ath6kl_fwlog_open(struct inode *inode, struct file *file) { struct ath6kl *ar = inode->i_private; if (ar->debug.fwlog_open) return -EBUSY; ar->debug.fwlog_open = true; file->private_data = inode->i_private; return 0; } static int ath6kl_fwlog_release(struct inode *inode, struct file *file) { struct ath6kl *ar = inode->i_private; ar->debug.fwlog_open = false; return 0; } static ssize_t ath6kl_fwlog_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct sk_buff *skb; ssize_t ret_cnt; size_t len = 0; char *buf; buf = vmalloc(count); if (!buf) return -ENOMEM; /* read undelivered logs from firmware */ ath6kl_read_fwlogs(ar); spin_lock(&ar->debug.fwlog_queue.lock); while ((skb = __skb_dequeue(&ar->debug.fwlog_queue))) { if (skb->len > count - len) { /* not enough space, put skb back and leave */ __skb_queue_head(&ar->debug.fwlog_queue, skb); break; } memcpy(buf + len, skb->data, skb->len); len += skb->len; kfree_skb(skb); } spin_unlock(&ar->debug.fwlog_queue.lock); /* FIXME: what to do if len == 0? */ ret_cnt = simple_read_from_buffer(user_buf, count, ppos, buf, len); vfree(buf); return ret_cnt; } static const struct file_operations fops_fwlog = { .open = ath6kl_fwlog_open, .release = ath6kl_fwlog_release, .read = ath6kl_fwlog_read, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_fwlog_block_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct sk_buff *skb; ssize_t ret_cnt; size_t len = 0, not_copied; char *buf; int ret; buf = vmalloc(count); if (!buf) return -ENOMEM; spin_lock(&ar->debug.fwlog_queue.lock); if (skb_queue_len(&ar->debug.fwlog_queue) == 0) { /* we must init under queue lock */ init_completion(&ar->debug.fwlog_completion); spin_unlock(&ar->debug.fwlog_queue.lock); ret = wait_for_completion_interruptible( &ar->debug.fwlog_completion); if (ret == -ERESTARTSYS) { vfree(buf); return ret; } spin_lock(&ar->debug.fwlog_queue.lock); } while ((skb = __skb_dequeue(&ar->debug.fwlog_queue))) { if (skb->len > count - len) { /* not enough space, put skb back and leave */ __skb_queue_head(&ar->debug.fwlog_queue, skb); break; } memcpy(buf + len, skb->data, skb->len); len += skb->len; kfree_skb(skb); } spin_unlock(&ar->debug.fwlog_queue.lock); /* FIXME: what to do if len == 0? */ not_copied = copy_to_user(user_buf, buf, len); if (not_copied != 0) { ret_cnt = -EFAULT; goto out; } *ppos = *ppos + len; ret_cnt = len; out: vfree(buf); return ret_cnt; } static const struct file_operations fops_fwlog_block = { .open = ath6kl_fwlog_open, .release = ath6kl_fwlog_release, .read = ath6kl_fwlog_block_read, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_fwlog_mask_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; char buf[16]; int len; len = snprintf(buf, sizeof(buf), "0x%x\n", ar->debug.fwlog_mask); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t ath6kl_fwlog_mask_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; int ret; ret = kstrtou32_from_user(user_buf, count, 0, &ar->debug.fwlog_mask); if (ret) return ret; ret = ath6kl_wmi_config_debug_module_cmd(ar->wmi, ATH6KL_FWLOG_VALID_MASK, ar->debug.fwlog_mask); if (ret) return ret; return count; } static const struct file_operations fops_fwlog_mask = { .open = simple_open, .read = ath6kl_fwlog_mask_read, .write = ath6kl_fwlog_mask_write, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t read_file_tgt_stats(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct ath6kl_vif *vif; struct target_stats *tgt_stats; char *buf; unsigned int len = 0, buf_len = 1500; int i; ssize_t ret_cnt; int rv; vif = ath6kl_vif_first(ar); if (!vif) return -EIO; buf = kzalloc(buf_len, GFP_KERNEL); if (!buf) return -ENOMEM; rv = ath6kl_read_tgt_stats(ar, vif); if (rv < 0) { kfree(buf); return rv; } tgt_stats = &vif->target_stats; len += scnprintf(buf + len, buf_len - len, "\n"); len += scnprintf(buf + len, buf_len - len, "%25s\n", "Target Tx stats"); len += scnprintf(buf + len, buf_len - len, "%25s\n\n", "================="); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Ucast packets", tgt_stats->tx_ucast_pkt); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Bcast packets", tgt_stats->tx_bcast_pkt); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Ucast byte", tgt_stats->tx_ucast_byte); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Bcast byte", tgt_stats->tx_bcast_byte); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Rts success cnt", tgt_stats->tx_rts_success_cnt); for (i = 0; i < 4; i++) len += scnprintf(buf + len, buf_len - len, "%18s %d %10llu\n", "PER on ac", i, tgt_stats->tx_pkt_per_ac[i]); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Error", tgt_stats->tx_err); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Fail count", tgt_stats->tx_fail_cnt); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Retry count", tgt_stats->tx_retry_cnt); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Multi retry cnt", tgt_stats->tx_mult_retry_cnt); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Rts fail cnt", tgt_stats->tx_rts_fail_cnt); len += scnprintf(buf + len, buf_len - len, "%25s %10llu\n\n", "TKIP counter measure used", tgt_stats->tkip_cnter_measures_invoked); len += scnprintf(buf + len, buf_len - len, "%25s\n", "Target Rx stats"); len += scnprintf(buf + len, buf_len - len, "%25s\n", "================="); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Ucast packets", tgt_stats->rx_ucast_pkt); len += scnprintf(buf + len, buf_len - len, "%20s %10d\n", "Ucast Rate", tgt_stats->rx_ucast_rate); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Bcast packets", tgt_stats->rx_bcast_pkt); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Ucast byte", tgt_stats->rx_ucast_byte); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Bcast byte", tgt_stats->rx_bcast_byte); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Fragmented pkt", tgt_stats->rx_frgment_pkt); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Error", tgt_stats->rx_err); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "CRC Err", tgt_stats->rx_crc_err); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Key cache miss", tgt_stats->rx_key_cache_miss); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Decrypt Err", tgt_stats->rx_decrypt_err); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Duplicate frame", tgt_stats->rx_dupl_frame); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Tkip Mic failure", tgt_stats->tkip_local_mic_fail); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "TKIP format err", tgt_stats->tkip_fmt_err); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "CCMP format Err", tgt_stats->ccmp_fmt_err); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n\n", "CCMP Replay Err", tgt_stats->ccmp_replays); len += scnprintf(buf + len, buf_len - len, "%25s\n", "Misc Target stats"); len += scnprintf(buf + len, buf_len - len, "%25s\n", "================="); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Beacon Miss count", tgt_stats->cs_bmiss_cnt); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Num Connects", tgt_stats->cs_connect_cnt); len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n", "Num disconnects", tgt_stats->cs_discon_cnt); len += scnprintf(buf + len, buf_len - len, "%20s %10d\n", "Beacon avg rssi", tgt_stats->cs_ave_beacon_rssi); len += scnprintf(buf + len, buf_len - len, "%20s %10d\n", "ARP pkt received", tgt_stats->arp_received); len += scnprintf(buf + len, buf_len - len, "%20s %10d\n", "ARP pkt matched", tgt_stats->arp_matched); len += scnprintf(buf + len, buf_len - len, "%20s %10d\n", "ARP pkt replied", tgt_stats->arp_replied); if (len > buf_len) len = buf_len; ret_cnt = simple_read_from_buffer(user_buf, count, ppos, buf, len); kfree(buf); return ret_cnt; } static const struct file_operations fops_tgt_stats = { .read = read_file_tgt_stats, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; #define print_credit_info(fmt_str, ep_list_field) \ (len += scnprintf(buf + len, buf_len - len, fmt_str, \ ep_list->ep_list_field)) #define CREDIT_INFO_DISPLAY_STRING_LEN 200 #define CREDIT_INFO_LEN 128 static ssize_t read_file_credit_dist_stats(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct htc_target *target = ar->htc_target; struct htc_endpoint_credit_dist *ep_list; char *buf; unsigned int buf_len, len = 0; ssize_t ret_cnt; buf_len = CREDIT_INFO_DISPLAY_STRING_LEN + get_queue_depth(&target->cred_dist_list) * CREDIT_INFO_LEN; buf = kzalloc(buf_len, GFP_KERNEL); if (!buf) return -ENOMEM; len += scnprintf(buf + len, buf_len - len, "%25s%5d\n", "Total Avail Credits: ", target->credit_info->total_avail_credits); len += scnprintf(buf + len, buf_len - len, "%25s%5d\n", "Free credits :", target->credit_info->cur_free_credits); len += scnprintf(buf + len, buf_len - len, " Epid Flags Cred_norm Cred_min Credits Cred_assngd" " Seek_cred Cred_sz Cred_per_msg Cred_to_dist" " qdepth\n"); list_for_each_entry(ep_list, &target->cred_dist_list, list) { print_credit_info(" %2d", endpoint); print_credit_info("%10x", dist_flags); print_credit_info("%8d", cred_norm); print_credit_info("%9d", cred_min); print_credit_info("%9d", credits); print_credit_info("%10d", cred_assngd); print_credit_info("%13d", seek_cred); print_credit_info("%12d", cred_sz); print_credit_info("%9d", cred_per_msg); print_credit_info("%14d", cred_to_dist); len += scnprintf(buf + len, buf_len - len, "%12d\n", get_queue_depth(&ep_list->htc_ep->txq)); } if (len > buf_len) len = buf_len; ret_cnt = simple_read_from_buffer(user_buf, count, ppos, buf, len); kfree(buf); return ret_cnt; } static const struct file_operations fops_credit_dist_stats = { .read = read_file_credit_dist_stats, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static unsigned int print_endpoint_stat(struct htc_target *target, char *buf, unsigned int buf_len, unsigned int len, int offset, const char *name) { int i; struct htc_endpoint_stats *ep_st; u32 *counter; len += scnprintf(buf + len, buf_len - len, "%s:", name); for (i = 0; i < ENDPOINT_MAX; i++) { ep_st = &target->endpoint[i].ep_st; counter = ((u32 *) ep_st) + (offset / 4); len += scnprintf(buf + len, buf_len - len, " %u", *counter); } len += scnprintf(buf + len, buf_len - len, "\n"); return len; } static ssize_t ath6kl_endpoint_stats_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct htc_target *target = ar->htc_target; char *buf; unsigned int buf_len, len = 0; ssize_t ret_cnt; buf_len = sizeof(struct htc_endpoint_stats) / sizeof(u32) * (25 + ENDPOINT_MAX * 11); buf = kmalloc(buf_len, GFP_KERNEL); if (!buf) return -ENOMEM; #define EPSTAT(name) \ do { \ len = print_endpoint_stat(target, buf, buf_len, len, \ offsetof(struct htc_endpoint_stats, \ name), \ #name); \ } while (0) EPSTAT(cred_low_indicate); EPSTAT(tx_issued); EPSTAT(tx_pkt_bundled); EPSTAT(tx_bundles); EPSTAT(tx_dropped); EPSTAT(tx_cred_rpt); EPSTAT(cred_rpt_from_rx); EPSTAT(cred_rpt_from_other); EPSTAT(cred_rpt_ep0); EPSTAT(cred_from_rx); EPSTAT(cred_from_other); EPSTAT(cred_from_ep0); EPSTAT(cred_cosumd); EPSTAT(cred_retnd); EPSTAT(rx_pkts); EPSTAT(rx_lkahds); EPSTAT(rx_bundl); EPSTAT(rx_bundle_lkahd); EPSTAT(rx_bundle_from_hdr); EPSTAT(rx_alloc_thresh_hit); EPSTAT(rxalloc_thresh_byte); #undef EPSTAT if (len > buf_len) len = buf_len; ret_cnt = simple_read_from_buffer(user_buf, count, ppos, buf, len); kfree(buf); return ret_cnt; } static ssize_t ath6kl_endpoint_stats_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct htc_target *target = ar->htc_target; int ret, i; u32 val; struct htc_endpoint_stats *ep_st; ret = kstrtou32_from_user(user_buf, count, 0, &val); if (ret) return ret; if (val == 0) { for (i = 0; i < ENDPOINT_MAX; i++) { ep_st = &target->endpoint[i].ep_st; memset(ep_st, 0, sizeof(*ep_st)); } } return count; } static const struct file_operations fops_endpoint_stats = { .open = simple_open, .read = ath6kl_endpoint_stats_read, .write = ath6kl_endpoint_stats_write, .owner = THIS_MODULE, .llseek = default_llseek, }; static unsigned long ath6kl_get_num_reg(void) { int i; unsigned long n_reg = 0; for (i = 0; i < ARRAY_SIZE(diag_reg); i++) n_reg = n_reg + (diag_reg[i].reg_end - diag_reg[i].reg_start) / 4 + 1; return n_reg; } static bool ath6kl_dbg_is_diag_reg_valid(u32 reg_addr) { int i; for (i = 0; i < ARRAY_SIZE(diag_reg); i++) { if (reg_addr >= diag_reg[i].reg_start && reg_addr <= diag_reg[i].reg_end) return true; } return false; } static ssize_t ath6kl_regread_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; u8 buf[50]; unsigned int len = 0; if (ar->debug.dbgfs_diag_reg) len += scnprintf(buf + len, sizeof(buf) - len, "0x%x\n", ar->debug.dbgfs_diag_reg); else len += scnprintf(buf + len, sizeof(buf) - len, "All diag registers\n"); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t ath6kl_regread_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; unsigned long reg_addr; if (kstrtoul_from_user(user_buf, count, 0, &reg_addr)) return -EINVAL; if ((reg_addr % 4) != 0) return -EINVAL; if (reg_addr && !ath6kl_dbg_is_diag_reg_valid(reg_addr)) return -EINVAL; ar->debug.dbgfs_diag_reg = reg_addr; return count; } static const struct file_operations fops_diag_reg_read = { .read = ath6kl_regread_read, .write = ath6kl_regread_write, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static int ath6kl_regdump_open(struct inode *inode, struct file *file) { struct ath6kl *ar = inode->i_private; u8 *buf; unsigned long int reg_len; unsigned int len = 0, n_reg; u32 addr; __le32 reg_val; int i, status; /* Dump all the registers if no register is specified */ if (!ar->debug.dbgfs_diag_reg) n_reg = ath6kl_get_num_reg(); else n_reg = 1; reg_len = n_reg * REG_OUTPUT_LEN_PER_LINE; if (n_reg > 1) reg_len += REGTYPE_STR_LEN; buf = vmalloc(reg_len); if (!buf) return -ENOMEM; if (n_reg == 1) { addr = ar->debug.dbgfs_diag_reg; status = ath6kl_diag_read32(ar, TARG_VTOP(ar->target_type, addr), (u32 *)&reg_val); if (status) goto fail_reg_read; len += scnprintf(buf + len, reg_len - len, "0x%06x 0x%08x\n", addr, le32_to_cpu(reg_val)); goto done; } for (i = 0; i < ARRAY_SIZE(diag_reg); i++) { len += scnprintf(buf + len, reg_len - len, "%s\n", diag_reg[i].reg_info); for (addr = diag_reg[i].reg_start; addr <= diag_reg[i].reg_end; addr += 4) { status = ath6kl_diag_read32(ar, TARG_VTOP(ar->target_type, addr), (u32 *)&reg_val); if (status) goto fail_reg_read; len += scnprintf(buf + len, reg_len - len, "0x%06x 0x%08x\n", addr, le32_to_cpu(reg_val)); } } done: file->private_data = buf; return 0; fail_reg_read: ath6kl_warn("Unable to read memory:%u\n", addr); vfree(buf); return -EIO; } static ssize_t ath6kl_regdump_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { u8 *buf = file->private_data; return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); } static int ath6kl_regdump_release(struct inode *inode, struct file *file) { vfree(file->private_data); return 0; } static const struct file_operations fops_reg_dump = { .open = ath6kl_regdump_open, .read = ath6kl_regdump_read, .release = ath6kl_regdump_release, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_lrssi_roam_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; unsigned long lrssi_roam_threshold; int ret; if (kstrtoul_from_user(user_buf, count, 0, &lrssi_roam_threshold)) return -EINVAL; ar->lrssi_roam_threshold = lrssi_roam_threshold; ret = ath6kl_wmi_set_roam_lrssi_cmd(ar->wmi, ar->lrssi_roam_threshold); if (ret) return ret; return count; } static ssize_t ath6kl_lrssi_roam_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; char buf[32]; unsigned int len; len = snprintf(buf, sizeof(buf), "%u\n", ar->lrssi_roam_threshold); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static const struct file_operations fops_lrssi_roam_threshold = { .read = ath6kl_lrssi_roam_read, .write = ath6kl_lrssi_roam_write, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_regwrite_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; u8 buf[32]; unsigned int len = 0; len = scnprintf(buf, sizeof(buf), "Addr: 0x%x Val: 0x%x\n", ar->debug.diag_reg_addr_wr, ar->debug.diag_reg_val_wr); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t ath6kl_regwrite_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; char buf[32]; char *sptr, *token; unsigned int len = 0; u32 reg_addr, reg_val; len = min(count, sizeof(buf) - 1); if (copy_from_user(buf, user_buf, len)) return -EFAULT; buf[len] = '\0'; sptr = buf; token = strsep(&sptr, "="); if (!token) return -EINVAL; if (kstrtou32(token, 0, &reg_addr)) return -EINVAL; if (!ath6kl_dbg_is_diag_reg_valid(reg_addr)) return -EINVAL; if (kstrtou32(sptr, 0, &reg_val)) return -EINVAL; ar->debug.diag_reg_addr_wr = reg_addr; ar->debug.diag_reg_val_wr = reg_val; if (ath6kl_diag_write32(ar, ar->debug.diag_reg_addr_wr, cpu_to_le32(ar->debug.diag_reg_val_wr))) return -EIO; return count; } static const struct file_operations fops_diag_reg_write = { .read = ath6kl_regwrite_read, .write = ath6kl_regwrite_write, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; int ath6kl_debug_roam_tbl_event(struct ath6kl *ar, const void *buf, size_t len) { const struct wmi_target_roam_tbl *tbl; u16 num_entries; if (len < sizeof(*tbl)) return -EINVAL; tbl = (const struct wmi_target_roam_tbl *) buf; num_entries = le16_to_cpu(tbl->num_entries); if (struct_size(tbl, info, num_entries) > len) return -EINVAL; if (ar->debug.roam_tbl == NULL || ar->debug.roam_tbl_len < (unsigned int) len) { kfree(ar->debug.roam_tbl); ar->debug.roam_tbl = kmalloc(len, GFP_ATOMIC); if (ar->debug.roam_tbl == NULL) return -ENOMEM; } memcpy(ar->debug.roam_tbl, buf, len); ar->debug.roam_tbl_len = len; if (test_bit(ROAM_TBL_PEND, &ar->flag)) { clear_bit(ROAM_TBL_PEND, &ar->flag); wake_up(&ar->event_wq); } return 0; } static ssize_t ath6kl_roam_table_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; int ret; long left; struct wmi_target_roam_tbl *tbl; u16 num_entries, i; char *buf; unsigned int len, buf_len; ssize_t ret_cnt; if (down_interruptible(&ar->sem)) return -EBUSY; set_bit(ROAM_TBL_PEND, &ar->flag); ret = ath6kl_wmi_get_roam_tbl_cmd(ar->wmi); if (ret) { up(&ar->sem); return ret; } left = wait_event_interruptible_timeout( ar->event_wq, !test_bit(ROAM_TBL_PEND, &ar->flag), WMI_TIMEOUT); up(&ar->sem); if (left <= 0) return -ETIMEDOUT; if (ar->debug.roam_tbl == NULL) return -ENOMEM; tbl = (struct wmi_target_roam_tbl *) ar->debug.roam_tbl; num_entries = le16_to_cpu(tbl->num_entries); buf_len = 100 + num_entries * 100; buf = kzalloc(buf_len, GFP_KERNEL); if (buf == NULL) return -ENOMEM; len = 0; len += scnprintf(buf + len, buf_len - len, "roam_mode=%u\n\n" "# roam_util bssid rssi rssidt last_rssi util bias\n", le16_to_cpu(tbl->roam_mode)); for (i = 0; i < num_entries; i++) { struct wmi_bss_roam_info *info = &tbl->info[i]; len += scnprintf(buf + len, buf_len - len, "%d %pM %d %d %d %d %d\n", a_sle32_to_cpu(info->roam_util), info->bssid, info->rssi, info->rssidt, info->last_rssi, info->util, info->bias); } if (len > buf_len) len = buf_len; ret_cnt = simple_read_from_buffer(user_buf, count, ppos, buf, len); kfree(buf); return ret_cnt; } static const struct file_operations fops_roam_table = { .read = ath6kl_roam_table_read, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_force_roam_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; int ret; char buf[20]; size_t len; u8 bssid[ETH_ALEN]; len = min(count, sizeof(buf) - 1); if (copy_from_user(buf, user_buf, len)) return -EFAULT; buf[len] = '\0'; if (!mac_pton(buf, bssid)) return -EINVAL; ret = ath6kl_wmi_force_roam_cmd(ar->wmi, bssid); if (ret) return ret; return count; } static const struct file_operations fops_force_roam = { .write = ath6kl_force_roam_write, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_roam_mode_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; int ret; char buf[20]; size_t len; enum wmi_roam_mode mode; len = min(count, sizeof(buf) - 1); if (copy_from_user(buf, user_buf, len)) return -EFAULT; buf[len] = '\0'; if (len > 0 && buf[len - 1] == '\n') buf[len - 1] = '\0'; if (strcasecmp(buf, "default") == 0) mode = WMI_DEFAULT_ROAM_MODE; else if (strcasecmp(buf, "bssbias") == 0) mode = WMI_HOST_BIAS_ROAM_MODE; else if (strcasecmp(buf, "lock") == 0) mode = WMI_LOCK_BSS_MODE; else return -EINVAL; ret = ath6kl_wmi_set_roam_mode_cmd(ar->wmi, mode); if (ret) return ret; return count; } static const struct file_operations fops_roam_mode = { .write = ath6kl_roam_mode_write, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; void ath6kl_debug_set_keepalive(struct ath6kl *ar, u8 keepalive) { ar->debug.keepalive = keepalive; } static ssize_t ath6kl_keepalive_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; char buf[16]; int len; len = snprintf(buf, sizeof(buf), "%u\n", ar->debug.keepalive); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t ath6kl_keepalive_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; int ret; u8 val; ret = kstrtou8_from_user(user_buf, count, 0, &val); if (ret) return ret; ret = ath6kl_wmi_set_keepalive_cmd(ar->wmi, 0, val); if (ret) return ret; return count; } static const struct file_operations fops_keepalive = { .open = simple_open, .read = ath6kl_keepalive_read, .write = ath6kl_keepalive_write, .owner = THIS_MODULE, .llseek = default_llseek, }; void ath6kl_debug_set_disconnect_timeout(struct ath6kl *ar, u8 timeout) { ar->debug.disc_timeout = timeout; } static ssize_t ath6kl_disconnect_timeout_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; char buf[16]; int len; len = snprintf(buf, sizeof(buf), "%u\n", ar->debug.disc_timeout); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t ath6kl_disconnect_timeout_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; int ret; u8 val; ret = kstrtou8_from_user(user_buf, count, 0, &val); if (ret) return ret; ret = ath6kl_wmi_disctimeout_cmd(ar->wmi, 0, val); if (ret) return ret; return count; } static const struct file_operations fops_disconnect_timeout = { .open = simple_open, .read = ath6kl_disconnect_timeout_read, .write = ath6kl_disconnect_timeout_write, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_create_qos_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct ath6kl_vif *vif; char buf[200]; ssize_t len; char *sptr, *token; struct wmi_create_pstream_cmd pstream; u32 val32; u16 val16; vif = ath6kl_vif_first(ar); if (!vif) return -EIO; len = min(count, sizeof(buf) - 1); if (copy_from_user(buf, user_buf, len)) return -EFAULT; buf[len] = '\0'; sptr = buf; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou8(token, 0, &pstream.user_pri)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou8(token, 0, &pstream.traffic_direc)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou8(token, 0, &pstream.traffic_class)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou8(token, 0, &pstream.traffic_type)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou8(token, 0, &pstream.voice_psc_cap)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.min_service_int = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.max_service_int = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.inactivity_int = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.suspension_int = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.service_start_time = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou8(token, 0, &pstream.tsid)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou16(token, 0, &val16)) return -EINVAL; pstream.nominal_msdu = cpu_to_le16(val16); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou16(token, 0, &val16)) return -EINVAL; pstream.max_msdu = cpu_to_le16(val16); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.min_data_rate = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.mean_data_rate = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.peak_data_rate = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.max_burst_size = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.delay_bound = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.min_phy_rate = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.sba = cpu_to_le32(val32); token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou32(token, 0, &val32)) return -EINVAL; pstream.medium_time = cpu_to_le32(val32); pstream.nominal_phy = le32_to_cpu(pstream.min_phy_rate) / 1000000; ath6kl_wmi_create_pstream_cmd(ar->wmi, vif->fw_vif_idx, &pstream); return count; } static const struct file_operations fops_create_qos = { .write = ath6kl_create_qos_write, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_delete_qos_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct ath6kl_vif *vif; char buf[100]; ssize_t len; char *sptr, *token; u8 traffic_class; u8 tsid; vif = ath6kl_vif_first(ar); if (!vif) return -EIO; len = min(count, sizeof(buf) - 1); if (copy_from_user(buf, user_buf, len)) return -EFAULT; buf[len] = '\0'; sptr = buf; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou8(token, 0, &traffic_class)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou8(token, 0, &tsid)) return -EINVAL; ath6kl_wmi_delete_pstream_cmd(ar->wmi, vif->fw_vif_idx, traffic_class, tsid); return count; } static const struct file_operations fops_delete_qos = { .write = ath6kl_delete_qos_write, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_bgscan_int_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct ath6kl_vif *vif; u16 bgscan_int; char buf[32]; ssize_t len; vif = ath6kl_vif_first(ar); if (!vif) return -EIO; len = min(count, sizeof(buf) - 1); if (copy_from_user(buf, user_buf, len)) return -EFAULT; buf[len] = '\0'; if (kstrtou16(buf, 0, &bgscan_int)) return -EINVAL; if (bgscan_int == 0) bgscan_int = 0xffff; vif->bg_scan_period = bgscan_int; ath6kl_wmi_scanparams_cmd(ar->wmi, 0, 0, 0, bgscan_int, 0, 0, 0, 3, 0, 0, 0); return count; } static const struct file_operations fops_bgscan_int = { .write = ath6kl_bgscan_int_write, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_listen_int_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct ath6kl_vif *vif; u16 listen_interval; char buf[32]; ssize_t len; vif = ath6kl_vif_first(ar); if (!vif) return -EIO; len = min(count, sizeof(buf) - 1); if (copy_from_user(buf, user_buf, len)) return -EFAULT; buf[len] = '\0'; if (kstrtou16(buf, 0, &listen_interval)) return -EINVAL; if ((listen_interval < 15) || (listen_interval > 3000)) return -EINVAL; vif->listen_intvl_t = listen_interval; ath6kl_wmi_listeninterval_cmd(ar->wmi, vif->fw_vif_idx, vif->listen_intvl_t, 0); return count; } static ssize_t ath6kl_listen_int_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; struct ath6kl_vif *vif; char buf[32]; int len; vif = ath6kl_vif_first(ar); if (!vif) return -EIO; len = scnprintf(buf, sizeof(buf), "%u\n", vif->listen_intvl_t); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static const struct file_operations fops_listen_int = { .read = ath6kl_listen_int_read, .write = ath6kl_listen_int_write, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; static ssize_t ath6kl_power_params_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ath6kl *ar = file->private_data; u8 buf[100]; unsigned int len = 0; char *sptr, *token; u16 idle_period, ps_poll_num, dtim, tx_wakeup, num_tx; len = min(count, sizeof(buf) - 1); if (copy_from_user(buf, user_buf, len)) return -EFAULT; buf[len] = '\0'; sptr = buf; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou16(token, 0, &idle_period)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou16(token, 0, &ps_poll_num)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou16(token, 0, &dtim)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou16(token, 0, &tx_wakeup)) return -EINVAL; token = strsep(&sptr, " "); if (!token) return -EINVAL; if (kstrtou16(token, 0, &num_tx)) return -EINVAL; ath6kl_wmi_pmparams_cmd(ar->wmi, 0, idle_period, ps_poll_num, dtim, tx_wakeup, num_tx, 0); return count; } static const struct file_operations fops_power_params = { .write = ath6kl_power_params_write, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; void ath6kl_debug_init(struct ath6kl *ar) { skb_queue_head_init(&ar->debug.fwlog_queue); init_completion(&ar->debug.fwlog_completion); /* * Actually we are lying here but don't know how to read the mask * value from the firmware. */ ar->debug.fwlog_mask = 0; } /* * Initialisation needs to happen in two stages as fwlog events can come * before cfg80211 is initialised, and debugfs depends on cfg80211 * initialisation. */ int ath6kl_debug_init_fs(struct ath6kl *ar) { ar->debugfs_phy = debugfs_create_dir("ath6kl", ar->wiphy->debugfsdir); debugfs_create_file("tgt_stats", 0400, ar->debugfs_phy, ar, &fops_tgt_stats); if (ar->hif_type == ATH6KL_HIF_TYPE_SDIO) debugfs_create_file("credit_dist_stats", 0400, ar->debugfs_phy, ar, &fops_credit_dist_stats); debugfs_create_file("endpoint_stats", 0600, ar->debugfs_phy, ar, &fops_endpoint_stats); debugfs_create_file("fwlog", 0400, ar->debugfs_phy, ar, &fops_fwlog); debugfs_create_file("fwlog_block", 0400, ar->debugfs_phy, ar, &fops_fwlog_block); debugfs_create_file("fwlog_mask", 0600, ar->debugfs_phy, ar, &fops_fwlog_mask); debugfs_create_file("reg_addr", 0600, ar->debugfs_phy, ar, &fops_diag_reg_read); debugfs_create_file("reg_dump", 0400, ar->debugfs_phy, ar, &fops_reg_dump); debugfs_create_file("lrssi_roam_threshold", 0600, ar->debugfs_phy, ar, &fops_lrssi_roam_threshold); debugfs_create_file("reg_write", 0600, ar->debugfs_phy, ar, &fops_diag_reg_write); debugfs_create_file("war_stats", 0400, ar->debugfs_phy, ar, &fops_war_stats); debugfs_create_file("roam_table", 0400, ar->debugfs_phy, ar, &fops_roam_table); debugfs_create_file("force_roam", 0200, ar->debugfs_phy, ar, &fops_force_roam); debugfs_create_file("roam_mode", 0200, ar->debugfs_phy, ar, &fops_roam_mode); debugfs_create_file("keepalive", 0600, ar->debugfs_phy, ar, &fops_keepalive); debugfs_create_file("disconnect_timeout", 0600, ar->debugfs_phy, ar, &fops_disconnect_timeout); debugfs_create_file("create_qos", 0200, ar->debugfs_phy, ar, &fops_create_qos); debugfs_create_file("delete_qos", 0200, ar->debugfs_phy, ar, &fops_delete_qos); debugfs_create_file("bgscan_interval", 0200, ar->debugfs_phy, ar, &fops_bgscan_int); debugfs_create_file("listen_interval", 0600, ar->debugfs_phy, ar, &fops_listen_int); debugfs_create_file("power_params", 0200, ar->debugfs_phy, ar, &fops_power_params); return 0; } void ath6kl_debug_cleanup(struct ath6kl *ar) { skb_queue_purge(&ar->debug.fwlog_queue); complete(&ar->debug.fwlog_completion); kfree(ar->debug.roam_tbl); } #endif
3 2 1 1 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 // SPDX-License-Identifier: GPL-2.0-or-later /* System trusted keyring for trusted public keys * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/uidgid.h> #include <linux/verification.h> #include <keys/asymmetric-type.h> #include <keys/system_keyring.h> #include <crypto/pkcs7.h> static struct key *builtin_trusted_keys; #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING static struct key *secondary_trusted_keys; #endif #ifdef CONFIG_INTEGRITY_MACHINE_KEYRING static struct key *machine_trusted_keys; #endif #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING static struct key *platform_trusted_keys; #endif extern __initconst const u8 system_certificate_list[]; extern __initconst const unsigned long system_certificate_list_size; extern __initconst const unsigned long module_cert_size; /** * restrict_link_by_builtin_trusted - Restrict keyring addition by built-in CA * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restriction_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in the built in system keyring. */ int restrict_link_by_builtin_trusted(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return restrict_link_by_signature(dest_keyring, type, payload, builtin_trusted_keys); } /** * restrict_link_by_digsig_builtin - Restrict digitalSignature key additions by the built-in keyring * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restriction_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in the built in system keyring. The new key * must have the digitalSignature usage field set. */ int restrict_link_by_digsig_builtin(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return restrict_link_by_digsig(dest_keyring, type, payload, builtin_trusted_keys); } #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING /** * restrict_link_by_builtin_and_secondary_trusted - Restrict keyring * addition by both built-in and secondary keyrings. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in or the secondary system * keyrings. */ int restrict_link_by_builtin_and_secondary_trusted( struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { /* If we have a secondary trusted keyring, then that contains a link * through to the builtin keyring and the search will follow that link. */ if (type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &builtin_trusted_keys->payload) /* Allow the builtin keyring to be added to the secondary */ return 0; return restrict_link_by_signature(dest_keyring, type, payload, secondary_trusted_keys); } /** * restrict_link_by_digsig_builtin_and_secondary - Restrict by digitalSignature. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in or the secondary system * keyrings. The new key must have the digitalSignature usage field set. */ int restrict_link_by_digsig_builtin_and_secondary(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { /* If we have a secondary trusted keyring, then that contains a link * through to the builtin keyring and the search will follow that link. */ if (type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &builtin_trusted_keys->payload) /* Allow the builtin keyring to be added to the secondary */ return 0; return restrict_link_by_digsig(dest_keyring, type, payload, secondary_trusted_keys); } /* * Allocate a struct key_restriction for the "builtin and secondary trust" * keyring. Only for use in system_trusted_keyring_init(). */ static __init struct key_restriction *get_builtin_and_secondary_restriction(void) { struct key_restriction *restriction; restriction = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!restriction) panic("Can't allocate secondary trusted keyring restriction\n"); if (IS_ENABLED(CONFIG_INTEGRITY_MACHINE_KEYRING)) restriction->check = restrict_link_by_builtin_secondary_and_machine; else restriction->check = restrict_link_by_builtin_and_secondary_trusted; return restriction; } /** * add_to_secondary_keyring - Add to secondary keyring. * @source: Source of key * @data: The blob holding the key * @len: The length of the data blob * * Add a key to the secondary keyring. The key must be vouched for by a key in the builtin, * machine or secondary keyring itself. */ void __init add_to_secondary_keyring(const char *source, const void *data, size_t len) { key_ref_t key; key_perm_t perm; perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW; key = key_create_or_update(make_key_ref(secondary_trusted_keys, 1), "asymmetric", NULL, data, len, perm, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(key)) { pr_err("Problem loading X.509 certificate from %s to secondary keyring %ld\n", source, PTR_ERR(key)); return; } pr_notice("Loaded X.509 cert '%s'\n", key_ref_to_ptr(key)->description); key_ref_put(key); } #endif #ifdef CONFIG_INTEGRITY_MACHINE_KEYRING void __init set_machine_trusted_keys(struct key *keyring) { machine_trusted_keys = keyring; if (key_link(secondary_trusted_keys, machine_trusted_keys) < 0) panic("Can't link (machine) trusted keyrings\n"); } /** * restrict_link_by_builtin_secondary_and_machine - Restrict keyring addition. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in, the secondary, or * the machine keyrings. */ int restrict_link_by_builtin_secondary_and_machine( struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { if (machine_trusted_keys && type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &machine_trusted_keys->payload) /* Allow the machine keyring to be added to the secondary */ return 0; return restrict_link_by_builtin_and_secondary_trusted(dest_keyring, type, payload, restrict_key); } #endif /* * Create the trusted keyrings */ static __init int system_trusted_keyring_init(void) { pr_notice("Initialise system trusted keyrings\n"); builtin_trusted_keys = keyring_alloc(".builtin_trusted_keys", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_trusted_keys)) panic("Can't allocate builtin trusted keyring\n"); #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING secondary_trusted_keys = keyring_alloc(".secondary_trusted_keys", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE), KEY_ALLOC_NOT_IN_QUOTA, get_builtin_and_secondary_restriction(), NULL); if (IS_ERR(secondary_trusted_keys)) panic("Can't allocate secondary trusted keyring\n"); if (key_link(secondary_trusted_keys, builtin_trusted_keys) < 0) panic("Can't link trusted keyrings\n"); #endif return 0; } /* * Must be initialised before we try and load the keys into the keyring. */ device_initcall(system_trusted_keyring_init); __init int load_module_cert(struct key *keyring) { if (!IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG)) return 0; pr_notice("Loading compiled-in module X.509 certificates\n"); return x509_load_certificate_list(system_certificate_list, module_cert_size, keyring); } /* * Load the compiled-in list of X.509 certificates. */ static __init int load_system_certificate_list(void) { const u8 *p; unsigned long size; pr_notice("Loading compiled-in X.509 certificates\n"); #ifdef CONFIG_MODULE_SIG p = system_certificate_list; size = system_certificate_list_size; #else p = system_certificate_list + module_cert_size; size = system_certificate_list_size - module_cert_size; #endif return x509_load_certificate_list(p, size, builtin_trusted_keys); } late_initcall(load_system_certificate_list); #ifdef CONFIG_SYSTEM_DATA_VERIFICATION /** * verify_pkcs7_message_sig - Verify a PKCS#7-based signature on system data. * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. * @pkcs7: The PKCS#7 message that is the signature. * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, * (void *)1UL for all trusted keys). * @usage: The use to which the key is being put. * @view_content: Callback to gain access to content. * @ctx: Context for callback. */ int verify_pkcs7_message_sig(const void *data, size_t len, struct pkcs7_message *pkcs7, struct key *trusted_keys, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, size_t asn1hdrlen), void *ctx) { int ret; /* The data should be detached - so we need to supply it. */ if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) { pr_err("PKCS#7 signature with non-detached data\n"); ret = -EBADMSG; goto error; } ret = pkcs7_verify(pkcs7, usage); if (ret < 0) goto error; ret = is_key_on_revocation_list(pkcs7); if (ret != -ENOKEY) { pr_devel("PKCS#7 key is on revocation list\n"); goto error; } if (!trusted_keys) { trusted_keys = builtin_trusted_keys; } else if (trusted_keys == VERIFY_USE_SECONDARY_KEYRING) { #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING trusted_keys = secondary_trusted_keys; #else trusted_keys = builtin_trusted_keys; #endif } else if (trusted_keys == VERIFY_USE_PLATFORM_KEYRING) { #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING trusted_keys = platform_trusted_keys; #else trusted_keys = NULL; #endif if (!trusted_keys) { ret = -ENOKEY; pr_devel("PKCS#7 platform keyring is not available\n"); goto error; } } ret = pkcs7_validate_trust(pkcs7, trusted_keys); if (ret < 0) { if (ret == -ENOKEY) pr_devel("PKCS#7 signature not signed with a trusted key\n"); goto error; } if (view_content) { size_t asn1hdrlen; ret = pkcs7_get_content_data(pkcs7, &data, &len, &asn1hdrlen); if (ret < 0) { if (ret == -ENODATA) pr_devel("PKCS#7 message does not contain data\n"); goto error; } ret = view_content(ctx, data, len, asn1hdrlen); } error: pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /** * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data. * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. * @raw_pkcs7: The PKCS#7 message that is the signature. * @pkcs7_len: The size of @raw_pkcs7. * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, * (void *)1UL for all trusted keys). * @usage: The use to which the key is being put. * @view_content: Callback to gain access to content. * @ctx: Context for callback. */ int verify_pkcs7_signature(const void *data, size_t len, const void *raw_pkcs7, size_t pkcs7_len, struct key *trusted_keys, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, size_t asn1hdrlen), void *ctx) { struct pkcs7_message *pkcs7; int ret; pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len); if (IS_ERR(pkcs7))